## 06. Fine-tuning the pre-trained NuBERT model for Amount prediction.

In [1]:
import os
import argparse
import logging

from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from nubert.datasets import AmountDataset
from nubert.config import AmountConfig, TrainerConfig

In [2]:

def split_dataset(dataset, val_size=0.1, seed=42):
    """Split ``dataset`` into a training part and a validation part.

    Args:
        dataset: Collection forwarded unchanged to ``train_test_split``.
        val_size: Forwarded as ``test_size`` (defaults to 0.1).
        seed: Forwarded as ``random_state`` so the split is repeatable.

    Returns:
        A ``(train, val)`` tuple.
    """
    train_part, val_part = train_test_split(
        dataset,
        test_size=val_size,
        random_state=seed,
    )
    return train_part, val_part

def create_hf_dataset(data):
    """Build a Hugging Face ``Dataset`` from an iterable of example mappings.

    Each example is indexed with ``"input_ids"`` and ``"label"``; the resulting
    dataset has the columns ``"input_ids"`` and ``"labels"``.
    """
    # Output column name -> key read from every example. Insertion order keeps
    # the "input_ids" pass ahead of the "label" pass.
    column_sources = {"input_ids": "input_ids", "labels": "label"}
    columns = {
        column: [example[source_key] for example in data]
        for column, source_key in column_sources.items()
    }
    return Dataset.from_dict(columns)


In [3]:
import gc
import torch
import wandb

def train_model(
    dataset,
    config: AmountConfig,
    num_labels: int,
    ):
    """Fine-tune a sequence-classification model on ``dataset`` and save it.

    The dataset is split with ``split_dataset``, both parts are converted with
    ``create_hf_dataset``, and a Hugging Face ``Trainer`` built from
    ``config.trainer`` runs the training. Afterwards the model and the
    dataset's base tokenizer are saved to ``config.trainer.output_dir``.

    Args:
        dataset: Dataset to fine-tune on. It must expose
            ``tokenizer.base_tokenizer`` and be accepted by ``split_dataset``.
        config: Provides ``model_name`` (the checkpoint to load) and
            ``trainer`` (dumped into ``TrainingArguments`` keyword arguments).
        num_labels: Number of output classes for the classification head.

    Returns:
        None.

    Raises:
        Any exception raised while training or saving is propagated after the
        W&B run has been closed and the model references have been dropped.
    """
    training_args = TrainingArguments(
        **config.trainer.model_dump()
    )

    # Seed before the model is instantiated: weights that are missing from the
    # checkpoint are randomly initialised at load time, which happens before
    # the Trainer gets a chance to seed anything itself.
    set_seed(training_args.seed)

    model = AutoModelForSequenceClassification.from_pretrained(
        config.model_name,
        num_labels=num_labels,
    )
    tokenizer = dataset.tokenizer.base_tokenizer

    train_data, val_data = split_dataset(dataset)

    train_dataset = create_hf_dataset(train_data)
    val_dataset = create_hf_dataset(val_data)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    torch.set_float32_matmul_precision("medium")

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Non-zero until everything succeeded, so a crashed run is reported to W&B
    # as failed instead of being left open or marked as finished cleanly.
    exit_code = 1
    try:
        trainer.train()

        trainer.save_model()
        tokenizer.save_pretrained(config.trainer.output_dir)
        exit_code = 0
    finally:
        # Always close the W&B run so the next call starts a fresh one.
        wandb.finish(exit_code=exit_code)
        # The trainer also references the model (and optimizer state); both
        # names have to go before collecting, otherwise nothing is released.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()


In [4]:
# W&B settings read by the Hugging Face integration during training.
os.environ["WANDB_PROJECT"] = "amount"
os.environ["WANDB_LOG_MODEL"] = "end"

# Directory holding the pre-trained checkpoints and the directory under which
# every fine-tuned model is stored.
pretrained_root = "/notebooks/nubank/models/nubert"
finetuned_root = "/notebooks/nubank/models/amount"

num_transactions_to_test = [5, 7, 10]
stride_to_test = [1, 2]
num_bins_to_test = [15, 20]
randomized_to_test = [False, True]

for num_transactions in num_transactions_to_test:
    for stride in stride_to_test:
        for num_bins in num_bins_to_test:
            for randomize_column_order in randomized_to_test:
                # NOTE(review): randomize_column_order only selects which
                # pre-trained checkpoint is loaded; it is not passed to
                # AmountConfig - confirm that this is intended.
                run_name = f"nubert-distil-transactions-{num_transactions}-stride-{stride}-randomize-{str(randomize_column_order)}-bins-{num_bins}"
                model_name = os.path.join(pretrained_root, run_name)
                trainer_config = TrainerConfig(
                    per_device_train_batch_size = 64,
                    per_device_eval_batch_size = 64,
                    # One directory per configuration: with a single shared
                    # output_dir every run overwrites the model and tokenizer
                    # saved by the previous one.
                    output_dir = os.path.join(finetuned_root, run_name),
                )
                config = AmountConfig(
                    model_name = model_name,
                    dataset_path = "/notebooks/nubank/nubert/analyses/amount-2014-2014",
                    file_name = "amount_raw",
                    num_transactions = num_transactions,
                    stride = stride,
                    num_bins = num_bins,
                    trainer=trainer_config,
                )
                full_dataset = AmountDataset.from_config(config)
                train_model(dataset=full_dataset, config=config, num_labels=num_bins)

  df = pd.read_csv(path.join(root, f"{fname}.csv"))
100%|██████████| 113/113 [11:33<00:00,  6.14s/it]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /notebooks/nubank/models/nubert/nubert-distil-transactions-7-stride-1-randomize-False-bins-15 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after pa

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,1.3525,1.412897


VBox(children=(Label(value='80.228 MB of 255.652 MB uploaded\r'), FloatProgress(value=0.31381597855168114, max…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▁▆▄▄▂▆▃▂▁▃▄█▄▃▂▁▃▃▂▃▅▂▅▃▃▃▃▃▃▂▃▃▂▄▄▃▄▅▃▃
train/learning_rate,█████▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
train/loss,█▅▅▄▄▃▃▄▃▃▂▂▂▃▃▂▂▃▂▂▃▃▂▂▂▂▂▂▂▂▂▁▂▂▃▁▁▂▂▁

0,1
eval/loss,1.4129
eval/runtime,29.2368
eval/samples_per_second,732.022
eval/steps_per_second,11.458
total_flos,1.943389417075164e+16
train/epoch,1.0
train/global_step,3010.0
train/grad_norm,4.37164
train/learning_rate,0.0
train/loss,1.3525


## Baseline: Fine-tuning without the pre-training step

In [5]:
# W&B settings read by the Hugging Face integration during training.
os.environ["WANDB_PROJECT"] = "amount"
os.environ["WANDB_LOG_MODEL"] = "end"


# Single configuration used for the baseline run.
num_transactions = 7
stride = 1
num_bins = 15
randomized = False  # not consumed by anything in this cell


# Stock DistilBERT checkpoint, i.e. no NuBERT pre-training.
model_name = "distilbert/distilbert-base-uncased"

# The baseline gets its own directory so that it does not overwrite the
# fine-tuned NuBERT models written to /notebooks/nubank/models/amount.
baseline_run_name = f"baseline-distilbert-transactions-{num_transactions}-stride-{stride}-bins-{num_bins}"
trainer_config = TrainerConfig(
    per_device_train_batch_size = 128,
    per_device_eval_batch_size = 128,
    output_dir = os.path.join("/notebooks/nubank/models/amount", baseline_run_name),
)
config = AmountConfig(
    model_name = model_name,
    dataset_path = "/notebooks/nubank/nubert/analyses/amount-2014-2014",
    file_name = "amount_raw",
    num_transactions = num_transactions,
    stride = stride,
    num_bins = num_bins,
    trainer=trainer_config,
)
full_dataset = AmountDataset.from_config(config)
train_model(dataset=full_dataset, config=config, num_labels=num_bins)