In [24]:
import pandas as pd
import numpy as np

import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

In [25]:
import sys
import os

src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(os.path.abspath(src_dir))

from importlib import reload

import src.data.dataset

reload(src.data.dataset)

from src.data.dataset import TrainDataset

In [26]:
train_df = pd.read_csv('./data/dataset/train.csv', nrows=30)
valid_df = pd.read_csv('./data/dataset/valid.csv', nrows=20)
test_df = pd.read_csv('./data/dataset/test.csv', nrows=20)

In [27]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
max_len = 128

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [28]:
le_depart = LabelEncoder()
le_arrival = LabelEncoder()

# Combinez les colonnes 'departure' et 'arrival' de tous les ensembles de données
all_departures = pd.concat([train_df['departure'], valid_df['departure'], test_df['departure']])
all_arrivals = pd.concat([train_df['arrival'], valid_df['arrival'], test_df['arrival']])

# Ajustez les encodeurs sur l'ensemble combiné
le_depart.fit(all_departures)
le_arrival.fit(all_arrivals)

train_df['departure'] = le_depart.transform(train_df['departure'])
train_df['arrival'] = le_arrival.transform(train_df['arrival'])

# Transformez les ensembles de validation et de test en utilisant les encodeurs ajustés
valid_df['departure'] = le_depart.transform(valid_df['departure'])
valid_df['arrival'] = le_arrival.transform(valid_df['arrival'])

test_df['departure'] = le_depart.transform(test_df['departure'])
test_df['arrival'] = le_arrival.transform(test_df['arrival'])

In [29]:
train_dataset = TrainDataset(
    train_df["sentence"], train_df["departure"], train_df["arrival"], tokenizer, max_len)
val_dataset = TrainDataset(
    valid_df["sentence"], valid_df["departure"], valid_df["arrival"], tokenizer, max_len)
test_dataset = TrainDataset(
    test_df["sentence"], test_df["departure"], test_df["arrival"], tokenizer, max_len)

train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, pin_memory=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, pin_memory=True)

In [30]:
class BertForSequenceClassification(nn.Module):
    def __init__(self, n_depart, n_arrival):
        super(BertForSequenceClassification, self).__init__()
        self.bert = BertModel.from_pretrained('distilbert-base-multilingual-cased')
        self.drop = nn.Dropout(p=0.3)
        self.out_depart = nn.Linear(self.bert.config.hidden_size, n_depart)
        self.out_arrival = nn.Linear(self.bert.config.hidden_size, n_arrival)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs[1]
        output_depart = self.out_depart(self.drop(pooled_output))
        output_arrival = self.out_arrival(self.drop(pooled_output))
        return output_depart, output_arrival

In [31]:
model = BertForSequenceClassification(len(le_depart.classes_), len(le_arrival.classes_))
model = model.to('cpu')

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias',

In [32]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to('cpu')

def train_epoch(model, data_loader, loss_fn, optimizer, device, current_epoch):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm.tqdm(data_loader, desc=f"Epoch {current_epoch}", unit="batch"):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels_depart = d["departure"].to(device)
        labels_arrival = d["arrival"].to(device)

        optimizer.zero_grad()

        outputs_depart, outputs_arrival = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss_depart = loss_fn(outputs_depart, labels_depart)
        loss_arrival = loss_fn(outputs_arrival, labels_arrival)
        loss = loss_depart + loss_arrival

        correct_predictions += (outputs_depart.argmax(1) == labels_depart).sum().item()
        correct_predictions += (outputs_arrival.argmax(1) == labels_arrival).sum().item()

        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    return correct_predictions / (2 * len(data_loader.dataset)), np.mean(losses)

In [48]:
sentence = f"Je souhaite aller de {le_depart.classes_[0]} à {le_arrival.classes_[0]}."
# Tokenisez la phrase
inputs = tokenizer(sentence, return_tensors="pt")

# Récupérez les input_ids et attention_mask
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]


In [63]:
print(sentence)

with torch.no_grad():  # Désactivez la grad pour économiser de la mémoire
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# print(torch.max(outputs[0], 1))
depart = le_depart.inverse_transform(torch.max(outputs[0], 1).indices)
arrival = le_arrival.inverse_transform(torch.max(outputs[1], 1).indices)
print(f"Depart: {depart[0]}, Arrival: {arrival[0]}")

Je souhaite aller de Aix-en-Provence à Auch.
Depart: Raves-Ban-de-Laveline, Arrival: Gérard


In [47]:
history = []
epochs = 10

for epoch in range(epochs):
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        'cpu',
        epoch + 1
    )
    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train loss {train_loss} accuracy {train_acc}')

Epoch 1/10


Training...: 100%|████████████████████████████████████| 15/15 [00:35<00:00,  2.39s/batch]


Epoch 1/10
Train loss 8.939369519551596 accuracy 0.0
Epoch 2/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.18s/batch]


Epoch 2/10
Train loss 8.144484519958496 accuracy 0.0
Epoch 3/10


Training...: 100%|████████████████████████████████████| 15/15 [00:34<00:00,  2.30s/batch]


Epoch 3/10
Train loss 7.934565194447836 accuracy 0.03333333333333333
Epoch 4/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.14s/batch]


Epoch 4/10
Train loss 7.904856077829996 accuracy 0.0
Epoch 5/10


Training...: 100%|████████████████████████████████████| 15/15 [00:34<00:00,  2.29s/batch]


Epoch 5/10
Train loss 7.789430936177571 accuracy 0.08333333333333333
Epoch 6/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.19s/batch]


Epoch 6/10
Train loss 7.643130429585775 accuracy 0.016666666666666666
Epoch 7/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.19s/batch]


Epoch 7/10
Train loss 7.494558461507162 accuracy 0.03333333333333333
Epoch 8/10


Training...: 100%|████████████████████████████████████| 15/15 [00:30<00:00,  2.05s/batch]


Epoch 8/10
Train loss 7.518856779734294 accuracy 0.0
Epoch 9/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.14s/batch]


Epoch 9/10
Train loss 7.430374495188395 accuracy 0.016666666666666666
Epoch 10/10


Training...: 100%|████████████████████████████████████| 15/15 [00:32<00:00,  2.17s/batch]

Epoch 10/10
Train loss 7.386019484202067 accuracy 0.05



