In [36]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [37]:
import sys
import os

# Switch off the tokenizers library's own parallelism before any tokenization happens.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Make the local `src` directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends the same entry to sys.path again.
src_dir = os.path.join(os.getcwd(), 'src')
src_path = os.path.abspath(src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

from importlib import reload

import model_2.utils.data_loader

# Re-import the module so that edits made to data_loader.py since the kernel
# started are picked up without a restart.
reload(model_2.utils.data_loader)

from model_2.utils.data_loader import DataEncoderNER

In [38]:
# Read the three NER splits (train / validation / test) in one pass.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '../data/dataset/ner_train.csv',
        '../data/dataset/ner_valid.csv',
        '../data/dataset/ner_test.csv',
    )
)

In [40]:
# Optimisation hyper-parameters.
batch_size, epochs, learning_rate = 30, 2, 2e-5

# Maximum length handed to the dataset wrapper together with the tokenizer.
max_len = 128

# Everything runs on the CPU in this notebook.
device = "cpu"

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
pretrained_name = "dbmdz/bert-base-french-europeana-cased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
bert_model = AutoModel.from_pretrained(pretrained_name)

In [41]:
# Map the textual `status` tags (plus the missing-value marker) to integer class ids.
le_sub_sentences = LabelEncoder()

classes = ["I", "O", "B", np.nan]
le_sub_sentences.fit(classes)

# The encoded ids overwrite the `status` column, so a second run of this cell
# would feed integers back into `transform` and fail. Skipping columns that are
# already integer-typed makes the cell safe to re-run.
for split_df in (train_df, val_df, test_df):
    if not pd.api.types.is_integer_dtype(split_df['status']):
        split_df['status'] = le_sub_sentences.transform(split_df['status'])

In [42]:
# NOTE(review): the recorded run of this cell fails with
# "DataEncoderNER.__init__() missing 1 required positional argument: 'max_len'",
# so the constructor appears to expect one more positional argument than is
# passed here — confirm against model_2/utils/data_loader.py.
train_dataset = DataEncoderNER(train_df["sub_sentence"], train_df["status"], tokenizer, max_len)
val_dataset = DataEncoderNER(val_df["sub_sentence"], val_df["status"], tokenizer, max_len)

# Page-locked (pinned) batches only speed up host-to-GPU copies, so request them
# only when the configured device is a CUDA device instead of unconditionally.
use_pinned_memory = str(device).startswith("cuda")

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, pin_memory=use_pinned_memory, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)

TypeError: DataEncoderNER.__init__() missing 1 required positional argument: 'max_len'

In [30]:
class BertForSequenceClassification(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (size of the logits' last dimension).
        bert: Optional encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` / ``attention_mask``, return an object
            whose element at index 1 is the pooled representation. When omitted,
            the notebook-level ``bert_model`` is used, which keeps existing
            ``BertForSequenceClassification(n_classes)`` calls working.
    """

    def __init__(self, n_classes, bert=None):
        super().__init__()
        # Fall back to the global encoder only when none is injected, so the
        # class no longer depends on hidden notebook state when one is provided.
        self.bert = bert_model if bert is None else bert
        self.drop = nn.Dropout(p=0.3)
        self.out_linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear head applied to the dropped-out pooled output.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index 1 is read as the pooled output (for a Hugging Face BertModel
        # this should be `pooler_output` — TODO confirm for other encoders).
        pooled_output = outputs[1]

        logits = self.out_linear(self.drop(pooled_output))

        return logits

In [33]:
# One output logit per class known to the label encoder.
n_labels = len(le_sub_sentences.classes_)
model = BertForSequenceClassification(n_labels).to(device)
# Optional warm start from a saved checkpoint (disabled):
# model.load_state_dict(torch.load("./processed/departure_arrival_model3_trained.pth"))

# Only parameters that require gradients are handed to the optimizer.
trainable_params = (param for param in model.parameters() if param.requires_grad)
optimizer = optim.Adam(trainable_params, lr=learning_rate)

loss_fn = nn.CrossEntropyLoss().to(device)

In [34]:
def train_epoch(model, data_loader_train, data_loader_valid, loss_fn, optimizer, device, current_epoch):
    """Train the model for one epoch, then evaluate it on the validation loader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns logits with the class scores on dimension 1.
        data_loader_train: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"status"`` tensors) used for optimisation.
        data_loader_valid: Iterable of batches with the same keys, used for evaluation.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batch tensors are moved to.
        current_epoch: Epoch number shown in the progress-bar description.

    Returns:
        dict with ``train_acc``, ``train_loss``, ``valid_acc`` and ``valid_loss``.
        Accuracies are correct predictions divided by the number of labels
        actually seen (0.0 if a loader yields nothing); losses are the mean of
        the per-batch losses (NaN if a loader yields nothing).
    """
    model.train()
    losses_train = []
    correct_predictions_train = 0
    seen_train = 0

    with tqdm(total=len(data_loader_train), desc=f"Epoch {current_epoch}", unit="batch") as pbar:
        for batch in data_loader_train:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["status"].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs, labels)

            correct_predictions_train += (outputs.argmax(1) == labels).sum().item()
            seen_train += labels.numel()
            losses_train.append(loss.item())

            loss.backward()
            optimizer.step()

            pbar.update(1)

    model.eval()
    losses_valid = []
    correct_predictions_valid = 0
    seen_valid = 0

    # No gradients are needed for evaluation; disabling autograd avoids building
    # a graph for every validation batch.
    with torch.no_grad():
        for batch in data_loader_valid:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["status"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs, labels)

            correct_predictions_valid += (outputs.argmax(1) == labels).sum().item()
            seen_valid += labels.numel()
            losses_valid.append(loss.item())

    # The model emits a single prediction per label, so the denominator is the
    # number of labels seen. The previous `2 * len(dataset)` capped accuracy at 0.5.
    train_acc = correct_predictions_train / seen_train if seen_train else 0.0
    train_loss = np.mean(losses_train) if losses_train else float("nan")

    valid_acc = correct_predictions_valid / seen_valid if seen_valid else 0.0
    valid_loss = np.mean(losses_valid) if losses_valid else float("nan")

    return {"train_acc": train_acc, "train_loss": train_loss, "valid_acc": valid_acc, "valid_loss": valid_loss}

In [35]:
# Per-epoch metric dictionaries, in training order.
# NOTE(review): the recorded run of this cell aborted in a DataLoader worker —
# the traceback shows data_loader.py's __getitem__ calling torch.tensor on a
# string value ("invalid data type 'str'"); the dataset class needs fixing first.
history = []

for epoch in range(epochs):
    # Epoch numbers shown in the progress bar start at 1.
    results = train_epoch(
        model=model,
        data_loader_train=train_loader,
        data_loader_valid=val_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        current_epoch=epoch + 1,
    )
    print(results)
    history.append(results)

Epoch 1:   0%|          | 0/1167 [00:00<?, ?batch/s]

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/Users/danyleguy/anaconda3/envs/myenv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/danyleguy/anaconda3/envs/myenv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/Users/danyleguy/Documents/Travail/EPITECH/MSc2/trip_advisor/local_repo/T-AIA-901_par_1/model/model_2/utils/data_loader.py", line 72, in __getitem__
    'sub_sentence': torch.tensor(self.sub_sentences[idx], dtype=torch.long)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: new(): invalid data type 'str'
