# 05. Modeling - Nubank AI Core Transaction Dataset Interview Project

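In this section we train the masked-language model under several hyperparameter settings (number of transactions per sequence, stride, number of bins, and whether the column order is randomized) and compare the resulting training and validation losses.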

In [1]:
import os
import argparse
import logging

from transformers import (
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    set_seed,
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from nubert.datasets import NuDataset
from nubert.config import NubertPreTrainConfig, TrainerConfig

In [2]:
def split_dataset(dataset, test_size=0.1, val_size=0.1, seed=42):
    """Split ``dataset`` into train, validation and test subsets.

    The test subset is carved out first. The validation subset is then taken
    from what remains, with ``val_size`` rescaled by ``1 - test_size`` so that
    it is expressed as a fraction of the *original* dataset rather than of the
    remainder.

    Args:
        dataset: Collection passed straight to ``train_test_split``.
        test_size: Fraction of the full dataset held out for testing; must be
            strictly between 0 and 1.
        val_size: Fraction of the full dataset held out for validation; must be
            positive and leave room for a training split
            (``test_size + val_size < 1``).
        seed: ``random_state`` used for both splits.

    Returns:
        A ``(train, val, test)`` tuple.

    Raises:
        ValueError: If the requested fractions cannot produce three splits.
    """
    # Fail fast with a message that names the offending argument instead of
    # letting the second split (or the division below) blow up later.
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be a fraction strictly between 0 and 1, got {test_size!r}"
        )

    # Fraction of the train+val remainder that must go to validation.
    val_fraction = val_size / (1 - test_size)
    if not 0 < val_fraction < 1:
        raise ValueError(
            "val_size must be positive and test_size + val_size must be below 1, "
            f"got test_size={test_size!r}, val_size={val_size!r}"
        )

    train_val, test = train_test_split(dataset, test_size=test_size, random_state=seed)
    train, val = train_test_split(train_val, test_size=val_fraction, random_state=seed)

    return train, val, test

def create_hf_dataset(data):
    """Wrap ``data`` in a ``Dataset`` with a single ``input_ids`` column."""
    columns = {"input_ids": data}
    return Dataset.from_dict(columns)

def resize_model_embeddings(model, tokenizer):
    """Resize the model's token embeddings to the tokenizer's length.

    Calls ``model.resize_token_embeddings(len(tokenizer))`` and returns the
    same ``model`` object that was passed in.
    """
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)
    return model

In [3]:
import gc
import torch
import wandb

def train_model(
    dataset,
    config: NubertPreTrainConfig,
    ):
    """Train a masked-language model on ``dataset`` and save it.

    Steps:
      1. Load ``config.model_name`` with ``AutoModelForMaskedLM`` and resize
         its embeddings to the dataset's base tokenizer.
      2. Split ``dataset.data`` into train/validation/test with
         ``split_dataset`` (the test split is held out and not used here).
      3. Train with a ``Trainer`` built from ``config.trainer.model_dump()``,
         using an MLM data collator and the validation split for evaluation.
      4. Save the model and tokenizer to ``config.trainer.output_dir``.

    Cleanup (closing the W&B run, dropping the model/trainer references and
    emptying the CUDA cache) always runs, even if training raises, so that a
    failure in one sweep iteration does not leave state behind for the next.

    Args:
        dataset: Object exposing ``tokenizer.base_tokenizer`` and ``data``.
        config: Pre-training configuration; ``model_name`` and ``trainer`` are
            read from it.

    Returns:
        None.
    """
    model = AutoModelForMaskedLM.from_pretrained(config.model_name)
    tokenizer = dataset.tokenizer.base_tokenizer
    trainer = None

    try:
        # Saved up front so the output directory has a tokenizer even if
        # training is interrupted before the final save below.
        tokenizer.save_pretrained(config.trainer.output_dir)
        model = resize_model_embeddings(model, tokenizer)

        # The test split is still carved out (keeping train/val identical to a
        # three-way split) but is intentionally not materialised or evaluated.
        train_data, val_data, _ = split_dataset(dataset.data)

        train_dataset = create_hf_dataset(train_data)
        val_dataset = create_hf_dataset(val_data)

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

        training_args = TrainingArguments(
            **config.trainer.model_dump()
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        trainer.train()

        trainer.save_model()
        tokenizer.save_pretrained(config.trainer.output_dir)
    finally:
        wandb.finish()
        # The trainer holds its own reference to the model, so both names must
        # be dropped before collecting; otherwise nothing can be reclaimed.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()


In [4]:
# Baseline configuration for a single run. The sweep cell further down builds
# its own TrainerConfig / NubertPreTrainConfig objects for every combination.
trainer_config = TrainerConfig(
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

config = NubertPreTrainConfig(
    dataset_path="/notebooks/nubank/nubert/analyses/nubank-2013-2014",
    file_name="nubank_raw",
    num_transactions=5,
    stride=1,
    num_bins=20,
    trainer=trainer_config,
)

# full_dataset = NuDataset.from_config(config)

In [5]:
# os.environ["WANDB_PROJECT"] = "nubert"
# os.environ["WANDB_LOG_MODEL"] = "end"


# num_transactions_to_test = [7]
# stride_to_test = [2]
# num_bins_to_test = [15, 20]

# for num_transactions in num_transactions_to_test:
#     for stride in stride_to_test:
#         for num_bins in num_bins_to_test:
#             trainer_config = TrainerConfig(
#                 per_device_train_batch_size = 64,
#                 per_device_eval_batch_size = 64,
#             )
#             config = NubertPreTrainConfig(
#                 dataset_path = "/notebooks/nubank/nubert/analyses/nubank-2013-2014",
#                 file_name = "nubank_raw",
#                 num_transactions = num_transactions,
#                 stride = stride,
#                 num_bins = num_bins,
#                 trainer=trainer_config,
#             )
#             full_dataset = NuDataset.from_config(config)
#             train_model(dataset=full_dataset, config=config)

In [None]:
# Weights & Biases settings, exported before any run is started.
os.environ["WANDB_PROJECT"] = "nubert"
os.environ["WANDB_LOG_MODEL"] = "end"

# Values to sweep; every combination below is trained once.
num_transactions_to_test = [10]
stride_to_test = [2]
num_bins_to_test = [15, 20]
randomized_to_test = [True, False]

# Flattened grid, iterated in the same order as four nested loops would:
# num_transactions (outermost) -> stride -> num_bins -> randomize (innermost).
# NOTE(review): every run uses the trainer's default output_dir, so later runs
# presumably overwrite earlier local checkpoints -- confirm this is intended.
for num_transactions, stride, num_bins, randomize_column_order in (
    (n_tx, step, bins, shuffled)
    for n_tx in num_transactions_to_test
    for step in stride_to_test
    for bins in num_bins_to_test
    for shuffled in randomized_to_test
):
    # A fresh trainer config per run, so runs never share a config object.
    trainer_config = TrainerConfig(
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
    )
    config = NubertPreTrainConfig(
        dataset_path="/notebooks/nubank/nubert/analyses/nubank-2013-2014",
        file_name="nubank_raw",
        num_transactions=num_transactions,
        stride=stride,
        num_bins=num_bins,
        trainer=trainer_config,
        randomize_column_order=randomize_column_order,
    )
    # The dataset depends on the swept parameters, so it is rebuilt each time.
    full_dataset = NuDataset.from_config(config)
    train_model(dataset=full_dataset, config=config)

  df = pd.read_csv(path.join(root, f"{fname}.csv"))
  df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

100%|██████████| 110/110 [30:54<00:00, 16.86s/it]  


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mrafaelmcelente[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,0.176,0.173036


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


VBox(children=(Label(value='86.992 MB of 255.730 MB uploaded\r'), FloatProgress(value=0.3401735202757312, max=…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇▇███
train/grad_norm,█▇▅▄▃▃▃▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▂▂▂▁▁▁▁▂▂▁▁▂▂▁
train/learning_rate,████▇▇▇▇▇▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,██▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▂▁▁▁▂▂▁▁▂▁▁▁▁

0,1
eval/loss,0.17304
eval/runtime,121.4844
eval/samples_per_second,352.012
eval/steps_per_second,5.507
total_flos,4.534091212924877e+16
train/epoch,1.0
train/global_step,5346.0
train/grad_norm,0.46306
train/learning_rate,0.0
train/loss,0.176


  df = pd.read_csv(path.join(root, f"{fname}.csv"))
  df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
100%|██████████| 110/110 [29:55<00:00, 16.33s/it]  
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,0.1776,0.170246


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


VBox(children=(Label(value='72.758 MB of 255.729 MB uploaded\r'), FloatProgress(value=0.28451082002111416, max…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,█▆▅▂▃▃▃▃▂▃▃▂▂▂▃▂▂▂▂▂▅▂▂▂▂▂▂▂▁▁▂▂▂▁▂▁▁▂▁▂
train/learning_rate,█████▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▂▂▂▂▂▂▂▁▁▁
train/loss,█▆▆▆▆▅▄▄▃▃▃▂▃▃▃▃▃▃▂▂▁▂▁▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.17025
eval/runtime,123.6607
eval/samples_per_second,345.817
eval/steps_per_second,5.41
total_flos,4.534091212924877e+16
train/epoch,1.0
train/global_step,5346.0
train/grad_norm,0.55395
train/learning_rate,0.0
train/loss,0.1776


  df = pd.read_csv(path.join(root, f"{fname}.csv"))
  df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
100%|██████████| 110/110 [30:58<00:00, 16.89s/it]  
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,0.1804,0.181425


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


VBox(children=(Label(value='64.633 MB of 255.729 MB uploaded\r'), FloatProgress(value=0.25273891991492303, max…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇███
train/grad_norm,█▇▅▇▃▃▄▅▃▃▂▃▃▃▂▂▂▂▁▂▂▂▄▃▂▂▂▂▄▂▁▂▁▂▄▁▃▂▁▁
train/learning_rate,██████▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▂▂▂▂▂▂▂▂▁▁
train/loss,█▇▄▃▃▃▃▂▃▂▂▂▂▂▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.18143
eval/runtime,123.8838
eval/samples_per_second,345.195
eval/steps_per_second,5.4
total_flos,4.534091212924877e+16
train/epoch,1.0
train/global_step,5346.0
train/grad_norm,0.52447
train/learning_rate,0.0
train/loss,0.1804


  df = pd.read_csv(path.join(root, f"{fname}.csv"))
  df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
100%|██████████| 110/110 [30:48<00:00, 16.80s/it]  
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,0.1834,0.176603


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


VBox(children=(Label(value='62.430 MB of 255.729 MB uploaded\r'), FloatProgress(value=0.24412386117087315, max…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▄▃▃▂▂▂▂▂▂▂▂▂▃▄▂▁▂▁▁▁▁▁▂▁▁▁▂▂▁▁▁▁▁▂▁▁▁▁▁
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▁
train/loss,█▇▆▆▅▅▄▄▄▄▃▃▃▃▄▃▃▃▂▃▂▂▃▂▂▂▁▁▂▁▂▁▂▁▂▂▁▂▂▂

0,1
eval/loss,0.1766
eval/runtime,124.5818
eval/samples_per_second,343.26
eval/steps_per_second,5.37
total_flos,4.534091212924877e+16
train/epoch,1.0
train/global_step,5346.0
train/grad_norm,0.52085
train/learning_rate,0.0
train/loss,0.1834


  df = pd.read_csv(path.join(root, f"{fname}.csv"))
  df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
100%|██████████| 110/110 [31:24<00:00, 17.13s/it]  
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
