# Report of the best model (BERT+Linear)

Running the following cells configures the model with the settings shown, loads a previously trained checkpoint, and evaluates it on the validation set.

## Macro-F1 0.8677

In [None]:
import torch

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger, CometLogger, TensorBoardLogger
from lightning.pytorch.profilers import PyTorchProfiler

from dvclive.lightning import DVCLiveLogger

from datamodule import AutoTokenizerDataModule
from module import CustomModel, LinearBAModel, LinearBEModel
from utils import create_dirs
from config import Config, DataModuleConfig, ModuleConfig

from typing import Optional, Union, List, Tuple
import numpy as np

In [None]:
# Fix the random seed through Lightning's helper so that runs are repeatable.
pl.seed_everything(59631546)

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token stored in the
# HUG_FACE_TOKEN environment variable (os.getenv yields None when it is unset).
token = os.getenv('HUG_FACE_TOKEN')
login(token)

First, let's configure some basic settings

In [None]:
class ModuleConfig:
    """Notebook-local model and optimisation settings.

    This definition rebinds the ``ModuleConfig`` name that was imported from
    ``config`` in the import cell, so every later cell reads the values below.
    """

    # Pretrained checkpoint id; change this to use a different pretrained checkpoint and tokenizer.
    model_name: str = "Twitter/twhin-bert-base"
    # model_name: str = "nvidia/NV-Embed-v2" # alternative checkpoint, kept for reference
    # Learning rate handed to the model constructor (read into `lr` in the next cell).
    learning_rate: float = 2e-5
    # Not referenced by the cells visible here; presumably per-component
    # learning rates for the BERT encoder and an LSTM head -- TODO confirm in module.py.
    learning_rate_bert: float = 1.2e-5
    learning_rate_lstm: float = 7e-5
    # Not referenced by the cells visible here; presumably the location of a
    # fine-tuned checkpoint -- TODO confirm against the code that reads it.
    finetuned: str = "checkpoints/twhin-bert-base-finetuned"
    # Passed to the datamodule as `max_length`.
    max_length: int = 128
    # The remaining values are not referenced by the cells visible here;
    # their exact use should be confirmed in module.py.
    attention_probs_dropout: float = 0.1
    classifier_dropout: Optional[float] = None
    warming_steps: int = 100
    focal_gamma: float = 2.0

In [None]:
# Model / data settings, copied out of the config classes so that the
# remaining cells can work with plain variables.
model_name, max_length, lr = (
    ModuleConfig.model_name,  # change this to use a different pretrained checkpoint and tokenizer
    ModuleConfig.max_length,
    ModuleConfig.learning_rate,
)
dataset_name, batch_size = DataModuleConfig.dataset_name, DataModuleConfig.batch_size

# Output locations used by the cache, loggers, checkpoints and profilers.
cache_dir, log_dir, ckpt_dir, prof_dir, perf_dir = (
    Config.cache_dir,
    Config.log_dir,
    Config.ckpt_dir,
    Config.prof_dir,
    Config.perf_dir,
)
# Create the directories up front to avoid failures if an empty one was deleted.
create_dirs([cache_dir, log_dir, ckpt_dir, prof_dir, perf_dir])

# Select the float32 matmul precision; see
# https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html
torch.set_float32_matmul_precision("medium")

Now, we can define our LightningDataModule, which will be used by Trainer for its DataLoaders

In [None]:
# Instantiate the project's AutoTokenizerDataModule with the checkpoint name,
# dataset, cache directory, batch size and sequence length chosen above.
lit_datamodule = AutoTokenizerDataModule(
    model_name=model_name,
    dataset_name=dataset_name,
    cache_dir=cache_dir,
    batch_size=batch_size,
    max_length=max_length
)

In [None]:
#lit_datamodule.clear_custom_cache()

In [None]:
# Run the datamodule's prepare_data hook
# (presumably downloads/caches the dataset -- confirm in datamodule.py).
lit_datamodule.prepare_data()

In [None]:
# Set the datamodule up for the "fit" stage; the validation dataloader used
# further down is assumed to be created here -- confirm in datamodule.py.
lit_datamodule.setup("fit")

Next, we create our custom LightningModule: a BERT encoder with a linear classification head.

In [None]:
#lit_datamodule.setup("test")

In [None]:
# Build the LinearBEModel LightningModule with the configured learning rate.
be_linear_model = LinearBEModel(learning_rate=lr)

Next, we define some common callbacks and our most basic logger, CSVLogger.

The EarlyStopping callback ends training early if a convergence criterion is met before the maximum number of epochs is reached.

ModelCheckpoint saves the model periodically. After training finishes, its best_model_path attribute gives the path to the best checkpoint file and best_model_score gives that checkpoint's score.

In [None]:
callbacks = []

# Stop training once val_f1 (higher is better) has stopped improving,
# with a patience of 3.
callbacks.append(EarlyStopping(monitor="val_f1", mode="max", patience=3))

# Keep the three best checkpoints ranked by val_f1, storing weights only.
callbacks.append(
    ModelCheckpoint(
        dirpath=ckpt_dir,
        filename="model",
        monitor="val_f1",
        mode="max",
        save_top_k=3,
        save_weights_only=True,
    )
)

# Disabled alternative, kept for reference:
# LoRACheckpoint(
#     monitor='val_f1',  # metric to monitor
#     filename='model-{epoch:02d}-{val_acc:.2f}',
#     save_top_k=3,       # number of top-k models to keep
#     mode='max',         # 'max' saves when the validation metric is at its highest
# ),

# Log the learning rate at every optimisation step.
callbacks.append(LearningRateMonitor(logging_interval='step'))

In [None]:
# Plain CSV logger; it is given log_dir as save_dir and "csv-logs" as its name.
logger = CSVLogger(name="csv-logs", save_dir=log_dir)

Finally, we create our Trainer and pass in our flags (settings), the callbacks, and the loggers. In this report we do not call fit; instead we load a previously trained checkpoint and run validation with it.

In [None]:
def print_seed() -> None:
    """Print the seeds currently in effect for PyTorch, CUDA and NumPy.

    The CUDA seed is queried only when CUDA is available, so the function
    also works on CPU-only machines instead of raising.
    """
    torch_init_seed = torch.initial_seed()
    # First word of NumPy's legacy global RandomState key; right after
    # np.random.seed(<int>) this equals the integer seed.
    numpy_seed = np.random.get_state()[1][0]

    # torch.cuda.initial_seed() has to initialise CUDA, which fails when no
    # usable CUDA device/runtime is present, so guard the call.
    if torch.cuda.is_available():
        torch_cuda_seed = torch.cuda.initial_seed()
    else:
        torch_cuda_seed = "unavailable (CUDA is not available)"

    print(f"pytorch seed: {torch_init_seed}")
    print(f"cuda seed: {torch_cuda_seed}")
    print(f"numpy seed: {numpy_seed}")

In [None]:
# Re-apply the same seed right before the trainer is built, so earlier cells
# that consumed random numbers do not affect what follows.
pl.seed_everything(59631546)

In [None]:
# Show the seeds that are now in effect.
print_seed()

In [None]:
import os

# Trainer: automatic device/strategy selection, 16-bit mixed precision,
# at most 8 epochs, deterministic algorithms, three loggers and the callbacks
# defined earlier.
lit_trainer = pl.Trainer(
    accelerator="auto",
    devices="auto",
    strategy="auto",
    precision="16-mixed",
    max_epochs=8,
    deterministic=True,
    logger=[
        logger,
        # Take the Comet key from the COMET_API_KEY environment variable instead
        # of hard-coding a secret (or a non-working placeholder) in the notebook.
        CometLogger(api_key=os.getenv("COMET_API_KEY")),
        DVCLiveLogger(save_dvc_exp=True),
    ],
    callbacks=callbacks,
)

In [None]:
# Release unused memory held by PyTorch's CUDA caching allocator before the
# checkpoint is loaded.
torch.cuda.empty_cache()

In [None]:
# load_from_checkpoint is a classmethod that builds and returns a new module;
# Lightning 2.x raises a TypeError when it is invoked on an instance, so it is
# called on the class here.
# NOTE: the checkpoint path is machine-specific; adjust it to your environment.
model = LinearBEModel.load_from_checkpoint(
    r"e:\bert-twetter-disaster-model-trained\checkpoints\model-v44.ckpt"
)

In [None]:
# Display the loaded module (notebook cell output).
model

In [None]:
from sklearn.metrics import classification_report

# Run the trainer's validation loop on the loaded model and keep the returned
# metrics for inspection in the next cell.
results = lit_trainer.validate(model=model, datamodule=lit_datamodule)

In [None]:
# Display the metrics returned by lit_trainer.validate (notebook cell output).
results

In [None]:
from tqdm.notebook import tqdm

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Ground-truth labels and predictions are gathered in the SAME pass over the
# validation dataloader. This keeps the two lists aligned row by row even if
# the loader's order differs between iterations, and reads the data only once.
y_true = []
y_pred = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(lit_datamodule.val_dataloader()):
        input_ids = batch[model.input_key].to(model.device)
        attention_mask = batch[model.mask_key].to(model.device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Each logit is passed through a sigmoid and thresholded at 0.5
        # independently, giving a 0/1 decision per output.
        preds = (torch.sigmoid(logits) > 0.5).int()

        y_true.extend(batch["label"].cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

In [None]:
# Display names handed to classification_report as target_names.
# NOTE(review): the order is assumed to match the label columns produced by
# the datamodule -- confirm against the dataset definition.
label_order = [
    "non_disaster", "disaster",
    "flood", "extreme_rain", "earthquake", "typhoon",
    "landslide", "tsunami", "volcano", "wildfire",
    "informative", "non_informative",
]


In [None]:
# Per-label precision/recall/F1 text report (4 decimal places) for the
# collected labels and predictions, with label_order as the row names.
report = classification_report(y_true, y_pred, target_names=label_order, digits=4)

In [None]:
# Show the text report.
print(report)

In [None]:
from plot_classification_report import plot_classification_report

In [None]:
# NOTE(review): this rebinds `report`, so the text computed by
# classification_report in the earlier cell is replaced by this placeholder.
# Paste the printed report between the quotes (or skip this cell) before
# plotting.
report = '''
Fill in the classification report here
'''

In [None]:
# Plot whatever `report` currently holds, using the project's plotting helper.
a = plot_classification_report(report)