In [1]:
import pandas as pd
import numpy as np

import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

In [2]:
import os
import sys
from importlib import reload

# Set before any tokenizer work happens in this kernel.
# NOTE(review): presumably here to silence the tokenizers fork warning that
# appears when DataLoader workers are used -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Expose the project's `src` directory on the import path. The membership check
# keeps the cell idempotent: re-running it no longer stacks duplicate entries
# onto sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

import src.data.dataset

# Reload so that edits made to the module on disk are picked up without
# restarting the kernel.
reload(src.data.dataset)

from src.data.dataset import TrainDataset

In [3]:
# Load a small slice of each split (30 / 20 / 20 rows), which keeps the
# notebook quick to run end to end.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path, nrows=row_limit)
    for csv_path, row_limit in (
        ('./data/dataset/train.csv', 30),
        ('./data/dataset/valid.csv', 20),
        ('./data/dataset/test.csv', 20),
    )
)

In [4]:
# Sequence-length setting handed to every TrainDataset built below.
max_len = 128

# Tokenizer matching the French BERT checkpoint used by the model.
# (An earlier experiment used BertTokenizer with
# 'distilbert-base-multilingual-cased' instead.)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")

In [5]:
# Every split is encoded with the same pair of encoders.
_all_splits = (train_df, valid_df, test_df)

# Pool the 'departure' and 'arrival' columns across all splits so each encoder
# knows every label it will later be asked to transform.
all_departures = pd.concat([_frame['departure'] for _frame in _all_splits])
all_arrivals = pd.concat([_frame['arrival'] for _frame in _all_splits])

# Fit one encoder per target on the pooled labels (fit() returns the encoder).
le_depart = LabelEncoder().fit(all_departures)
le_arrival = LabelEncoder().fit(all_arrivals)

# Replace the string labels with their integer ids, split by split.
# Note: this overwrites the columns in place, so the cell expects freshly
# loaded frames that still contain the original string labels.
for _frame in _all_splits:
    _frame['departure'] = le_depart.transform(_frame['departure'])
    _frame['arrival'] = le_arrival.transform(_frame['arrival'])

In [6]:
def _as_dataset(frame):
    """Wrap one split's sentence/departure/arrival columns in a TrainDataset."""
    return TrainDataset(
        frame["sentence"], frame["departure"], frame["arrival"], tokenizer, max_len)


train_dataset = _as_dataset(train_df)
val_dataset = _as_dataset(valid_df)
test_dataset = _as_dataset(test_df)

# Training batches are shuffled and loaded by 4 worker processes.
train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, pin_memory=True, num_workers=4)

# Validation and test loaders keep the dataset order and share their settings.
_eval_loader_options = dict(batch_size=2, shuffle=False, pin_memory=True)
val_loader = DataLoader(val_dataset, **_eval_loader_options)
test_loader = DataLoader(test_dataset, **_eval_loader_options)

In [7]:
class BertForSequenceClassification(nn.Module):
    """Pretrained encoder with two linear classification heads.

    One head scores the departure classes and the other scores the arrival
    classes; both read the same pooled encoder output.

    Args:
        n_depart: Number of departure classes (width of the departure head).
        n_arrival: Number of arrival classes (width of the arrival head).
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            The checkpoint's outputs must expose a pooled representation at
            index 1, because ``forward`` reads ``outputs[1]``.
        dropout: Dropout probability applied to the pooled output before each
            head.
    """

    def __init__(self, n_depart, n_arrival,
                 model_name="dbmdz/bert-base-french-europeana-cased",
                 dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=dropout)
        hidden_size = self.bert.config.hidden_size
        self.out_depart = nn.Linear(hidden_size, n_depart)
        self.out_arrival = nn.Linear(hidden_size, n_arrival)

    def forward(self, input_ids, attention_mask):
        """Return ``(departure_logits, arrival_logits)`` for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index 1 of the encoder output is used as the pooled sentence vector.
        pooled_output = outputs[1]
        # Dropout is applied separately for each head, so in training mode the
        # two heads see independently sampled dropout masks.
        output_depart = self.out_depart(self.drop(pooled_output))
        output_arrival = self.out_arrival(self.drop(pooled_output))
        return output_depart, output_arrival

In [8]:
# Size each head from the classes known to its label encoder and keep the
# whole model on the CPU.
model = BertForSequenceClassification(
    n_depart=len(le_depart.classes_),
    n_arrival=len(le_arrival.classes_),
).to('cpu')

In [9]:
# Cross-entropy is used for both heads; train_epoch sums the two losses.
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to('cpu')

# Adam over every model parameter (encoder and both heads).
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, loss_fn, optimizer, device, current_epoch):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning ``(departure_logits, arrival_logits)``.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``"input_ids"``, ``"attention_mask"``, ``"departure"`` and
            ``"arrival"`` holding tensors.
        loss_fn: Criterion applied separately to each head; the two losses are
            summed before back-propagation.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        current_epoch: Epoch number shown in the progress-bar label.

    Returns:
        ``(accuracy, mean_loss)``. Accuracy is the fraction of correct
        predictions over both heads, computed over the samples actually
        iterated, so it stays correct when the loader skips samples (for
        example with ``drop_last=True``). If the loader yields nothing, the
        result is ``(0.0, nan)`` rather than a ``ZeroDivisionError``.
    """
    model.train()
    batch_losses = []
    correct_predictions = 0
    samples_seen = 0

    for batch in tqdm.tqdm(data_loader, desc=f"Epoch {current_epoch}", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_depart = batch["departure"].to(device)
        labels_arrival = batch["arrival"].to(device)

        optimizer.zero_grad()

        outputs_depart, outputs_arrival = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Joint objective: plain sum of the two per-head losses.
        loss = loss_fn(outputs_depart, labels_depart) + loss_fn(outputs_arrival, labels_arrival)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        correct_predictions += (outputs_depart.argmax(1) == labels_depart).sum().item()
        correct_predictions += (outputs_arrival.argmax(1) == labels_arrival).sum().item()
        # Count what was really processed instead of trusting len(dataset).
        samples_seen += labels_depart.size(0)

    if samples_seen == 0:
        return 0.0, float("nan")

    # Each sample contributes two predictions (departure and arrival).
    return correct_predictions / (2 * samples_seen), np.mean(batch_losses)

In [10]:
# Probe sentence built from the first class known to each label encoder.
sentence = f"Je souhaite aller de {le_depart.classes_[0]} à {le_arrival.classes_[0]}."

# Tokenize the sentence into PyTorch tensors, then pull out the two model inputs.
inputs = tokenizer(sentence, return_tensors="pt")
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]


In [11]:
print(sentence)

# Inference has to run in eval mode: in training mode the model's dropout
# layers stay active and the prediction changes from one run to the next.
# The previous mode is restored afterwards so later cells are unaffected.
was_training = model.training
model.eval()
with torch.no_grad():  # no gradients needed for a forward-only pass
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
model.train(was_training)

# Take the highest-scoring class of each head and map the ids back to labels.
# The indices are converted to NumPy arrays before being handed to scikit-learn.
depart = le_depart.inverse_transform(outputs[0].argmax(dim=1).cpu().numpy())
arrival = le_arrival.inverse_transform(outputs[1].argmax(dim=1).cpu().numpy())
print(f"Depart: {depart[0]}, Arrival: {arrival[0]}")

Je souhaite aller de Aix-en-Provence à Auch.
Depart: Évreux-Embranchement, Arrival: Nogent-le-Rotrou


In [12]:
epochs = 2
# One record per completed epoch: epoch number, mean loss, accuracy.
history = []

for epoch in range(epochs):
    epoch_number = epoch + 1  # 1-based, for display and bookkeeping
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device='cpu',
        current_epoch=epoch_number,
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    history.append(
        dict(epoch=epoch_number, train_loss=train_loss, train_acc=train_acc))

Epoch 1:   0%|                                                   | 0/15 [00:00<?, ?batch/s]

Epoch 1: 100%|██████████████████████████████████████████| 15/15 [00:35<00:00,  2.37s/batch]


Train loss 8.641998545328777 accuracy 0.0


Epoch 2: 100%|██████████████████████████████████████████| 15/15 [00:35<00:00,  2.37s/batch]

Train loss 8.270745627085368 accuracy 0.0



