In [1]:
from pytorch_modules.model import DistilBERTClass
from pytorch_modules.dataset import MultiLabelDataset
import torch
from pytorch_modules import config
import pandas as pd
from transformers import DistilBertTokenizer
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from pytorch_modules.utils import train, save_model

In [2]:
# End-to-end fine-tuning run: load the CSV, build train/validation loaders,
# train DistilBERTClass for config.EPOCHS epochs and save the weights.

# One seed for both the train/validation split and the global torch RNG, so
# the split and the shuffled training-batch order repeat across fresh kernels.
SEED = 42
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Data ---------------------------------------------------------------
# config.DATA_DIR is handed straight to read_csv, so it must be the path of
# a CSV file (despite the "DIR" in its name).
df = pd.read_csv(config.DATA_DIR)

# NOTE(review): the recorded output of this cell still shows the
# "Truncation was not explicitly activated" warning, so `truncation=True`
# given here does not appear to take effect at encoding time. Presumably it
# has to be passed in the tokenizer call inside MultiLabelDataset — confirm.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased", truncation=True, do_lower_case=True
)
dataset = MultiLabelDataset(
    dataframe=df, tokenizer=tokenizer, max_len=config.MAX_LEN
)

# 80/20 split drawn from a dedicated, seeded generator so the membership of
# each subset does not depend on how much of the global RNG was consumed.
train_set, val_set = random_split(
    dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(SEED)
)

# --- Loaders ------------------------------------------------------------
train_dataloader = DataLoader(
    dataset=train_set,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=config.NUM_WORKERS,
)

# Validation batches are only scored, never trained on, so shuffling them
# adds nothing; a fixed order keeps evaluation repeatable.
val_dataloader = DataLoader(
    dataset=val_set,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    num_workers=config.NUM_WORKERS,
)

# --- Model, loss, optimiser ---------------------------------------------
model = DistilBERTClass()
model.to(device)

# BCEWithLogitsLoss applies the sigmoid itself, so it must be fed raw logits.
# NOTE(review): assumes DistilBERTClass returns un-activated logits — confirm.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

# --- Train and persist ----------------------------------------------------
# The validation loader is passed through the helper's `test_dataloader`
# parameter; `results` holds whatever train() returns.
results = train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=val_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=config.EPOCHS,
    device=device,
)

save_model(model=model, target_dir="models/", model_name="test_model.pt")

  0%|          | 0/1 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Epoch 0:   0%|          | 0/12766 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Testing Epoch 0:   0%|          | 0/3192 [00:00<?, ?it/s]



Epoch: 1 | train_loss: 0.0898 | train_acc: 0.9654 | test_loss: 0.0798 | test_acc: 0.9700 | train_epoch_time: 8909.3768 | test_epoch_time: 835.0987
[INFO] Saving model to: models/test_model.pt
