In [387]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

In [369]:
import sys
import os
# Set before any tokenizer is instantiated.
# NOTE(review): presumably silences the Hugging Face tokenizers fork/parallelism
# warning when DataLoader worker processes are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Append <current working directory>/src to the module search path.
# NOTE(review): the imports below use the `src.` package prefix, which appears to
# resolve from the working directory rather than from this entry — confirm the
# append is still required (e.g. for absolute imports inside the package).
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(os.path.abspath(src_dir))

from importlib import reload

import src.data.dataset

# Re-execute the module so the TrainDataset imported below reflects the module's
# current source without restarting the kernel.
reload(src.data.dataset)

from src.data.dataset import TrainDataset

In [370]:
# Read the training and validation splits from the local dataset directory.
# The test split is intentionally left unloaded (see the commented line below).
train_df = pd.read_csv(filepath_or_buffer='./data/dataset/train.csv')
valid_df = pd.read_csv(filepath_or_buffer='./data/dataset/valid.csv')
# test_df = pd.read_csv('./data/dataset/test.csv')

In [371]:
def _first_rows_per_station(df, n_rows, station_ids=range(10),
                            column="arrival_train_station_id"):
    """Return the first ``n_rows`` rows of ``df`` for each station id, stacked in id order.

    Args:
        df: Frame containing ``column``.
        n_rows: Maximum number of rows to keep per station id.
        station_ids: Station ids to keep, in output order (defaults to 0..9).
        column: Name of the column holding the station id.

    Returns:
        A DataFrame with the selected rows; the original index labels are kept.
    """
    # Collect the pieces first and concatenate once: growing a frame with
    # pd.concat inside the loop copies all previous rows on every iteration and
    # starts from an empty frame, which recent pandas versions warn about.
    parts = [df[df[column] == station_id].head(n=n_rows) for station_id in station_ids]
    return pd.concat(parts)


# Shrink both splits to a small debugging subset: 10 training rows and
# 2 validation rows for each arrival station id in 0..9.
new_train_df = _first_rows_per_station(train_df, n_rows=10)
new_valid_df = _first_rows_per_station(valid_df, n_rows=2)

train_df = new_train_df.reset_index(drop=True)
valid_df = new_valid_df.reset_index(drop=True)

In [372]:
# tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Load the tokenizer that matches the "dbmdz/bert-base-french-europeana-cased"
# checkpoint (the same checkpoint name is used when the encoder is loaded).
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")

# Passed to TrainDataset together with the tokenizer.
# NOTE(review): presumably the maximum token length per sentence — confirm in
# src/data/dataset.py.
max_len = 128

In [373]:
# Pool the 'departure' and 'arrival' columns of both splits so that each
# encoder knows every label that occurs in either the training or the
# validation data.
all_departures = pd.concat([train_df['departure'], valid_df['departure']])
all_arrivals = pd.concat([train_df['arrival'], valid_df['arrival']])

# Fit one encoder per target on the pooled labels.
le_depart = LabelEncoder()
le_depart.fit(all_departures)
le_arrival = LabelEncoder()
le_arrival.fit(all_arrivals)

# Replace the departure labels with their integer codes in both splits.
train_df['departure'] = le_depart.transform(train_df['departure'])
valid_df['departure'] = le_depart.transform(valid_df['departure'])

# Replace the arrival labels with their integer codes in both splits.
train_df['arrival'] = le_arrival.transform(train_df['arrival'])
valid_df['arrival'] = le_arrival.transform(valid_df['arrival'])

In [374]:
# Wrap each split (sentence text plus encoded departure/arrival labels) in the
# project's TrainDataset, sharing the tokenizer and max_len.
train_dataset = TrainDataset(
    train_df["sentence"],
    train_df["departure"],
    train_df["arrival"],
    tokenizer,
    max_len,
)
val_dataset = TrainDataset(
    valid_df["sentence"],
    valid_df["departure"],
    valid_df["arrival"],
    tokenizer,
    max_len,
)

# Training batches are shuffled and loaded by 4 worker processes; validation
# batches keep their order and are loaded in the main process.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=2,
    shuffle=False,
    pin_memory=True,
)

In [375]:
class BertForSequenceClassification(nn.Module):
    """Pretrained encoder followed by two independent linear classification heads.

    One head produces scores over departure classes, the other over arrival
    classes. Both heads read the same encoder output after dropout.
    """

    def __init__(self, n_depart, n_arrival):
        """Load the pretrained encoder and create the dropout layer and both heads.

        Args:
            n_depart: Number of departure classes (output size of ``out_depart``).
            n_arrival: Number of arrival classes (output size of ``out_arrival``).
        """
        super().__init__()
        # self.bert = BertModel.from_pretrained('distilbert-base-multilingual-cased')
        self.bert = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased")
        self.drop = nn.Dropout(p=0.3)

        # Both heads take the encoder's hidden size as their input width.
        hidden_size = self.bert.config.hidden_size
        self.out_depart = nn.Linear(hidden_size, n_depart)
        self.out_arrival = nn.Linear(hidden_size, n_arrival)

    def forward(self, input_ids, attention_mask):
        """Return ``(departure_scores, arrival_scores)`` for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is used as the sentence representation.
        # NOTE(review): presumably the pooled output of a BERT-style model — confirm
        # if the checkpoint is swapped for one without a pooler.
        pooled = encoder_out[1]

        # Dropout is applied separately for each head.
        depart_scores = self.out_depart(self.drop(pooled))
        arrival_scores = self.out_arrival(self.drop(pooled))
        return depart_scores, arrival_scores

In [376]:
# Size each head from the number of classes its label encoder learned, then
# place the whole model on the CPU.
model = BertForSequenceClassification(
    n_depart=len(le_depart.classes_),
    n_arrival=len(le_arrival.classes_),
).to('cpu')

In [377]:
# Cross-entropy is applied to each head's scores; train_epoch adds the two losses.
loss_fn = nn.CrossEntropyLoss().to('cpu')
# A single Adam optimizer updates every parameter of the model (encoder and heads).
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

def _run_batch(model, batch, loss_fn, device):
    """Forward one batch and return ``(loss, n_correct)``.

    ``loss`` is the sum of the departure and arrival losses (a tensor);
    ``n_correct`` is the number of correct departure predictions plus the
    number of correct arrival predictions in the batch (a Python int).
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels_depart = batch["departure"].to(device)
    labels_arrival = batch["arrival"].to(device)

    outputs_depart, outputs_arrival = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    loss = loss_fn(outputs_depart, labels_depart) + loss_fn(outputs_arrival, labels_arrival)

    n_correct = (outputs_depart.argmax(1) == labels_depart).sum().item()
    n_correct += (outputs_arrival.argmax(1) == labels_arrival).sum().item()
    return loss, n_correct


def train_epoch(model, data_loader_train, data_loader_valid, loss_fn, optimizer, device, current_epoch):
    """Train for one pass over the training loader, then evaluate on the validation loader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning ``(departure_scores, arrival_scores)``.
        data_loader_train: Iterable of batch dicts with keys ``input_ids``,
            ``attention_mask``, ``departure`` and ``arrival``; must expose ``.dataset``.
        data_loader_valid: Same structure as ``data_loader_train``, used for evaluation.
        loss_fn: Loss applied separately to each head; the two losses are summed.
        optimizer: Optimizer stepped once per training batch.
        device: Device the batch tensors are moved to.
        current_epoch: Epoch number shown in the progress-bar description.

    Returns:
        dict with ``train_acc``, ``train_loss``, ``valid_acc`` and ``valid_loss``.
        Accuracies count departure and arrival predictions together, hence the
        ``2 * len(dataset)`` denominator; losses are means over batches.

    Side effects:
        Updates the model parameters and leaves the model in eval mode.
    """
    model = model.train()
    losses_train = []
    losses_valid = []
    correct_predictions_train = 0
    correct_predictions_valid = 0

    for d in tqdm.tqdm(data_loader_train, desc=f"Epoch {current_epoch}", unit="batch"):
        optimizer.zero_grad()

        loss, n_correct = _run_batch(model, d, loss_fn, device)
        correct_predictions_train += n_correct
        losses_train.append(loss.item())

        loss.backward()
        optimizer.step()

    model = model.eval()

    # No parameter update happens during validation, so disable autograd:
    # otherwise every validation batch builds a computation graph that is
    # never used, costing memory and time.
    with torch.no_grad():
        for d in data_loader_valid:
            loss, n_correct = _run_batch(model, d, loss_fn, device)
            correct_predictions_valid += n_correct
            losses_valid.append(loss.item())

    train_acc = correct_predictions_train / (2 * len(data_loader_train.dataset))
    train_loss = np.mean(losses_train)

    valid_acc = correct_predictions_valid / (2 * len(data_loader_valid.dataset))
    valid_loss = np.mean(losses_valid)

    return {"train_acc": train_acc, "train_loss": train_loss, "valid_acc": valid_acc, "valid_loss": valid_loss}

In [378]:
# Build a probe sentence from the first departure and first arrival label
# known to the encoders.
sentence = f"Je souhaite aller de {le_depart.classes_[0]} à {le_arrival.classes_[0]}."
# Tokenize the sentence into PyTorch tensors.
inputs = tokenizer(sentence, return_tensors="pt")

# Extract the two tensors the model's forward() expects.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]


In [379]:
print(sentence)

# A freshly constructed nn.Module is in training mode, so the model's
# Dropout(p=0.3) layer would randomly perturb this prediction. Switch to
# eval mode so inference is deterministic.
model.eval()

with torch.no_grad():  # Disable gradient tracking to save memory
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Take the highest-scoring class of each head and hand plain numpy index
# arrays to the label encoders to map them back to station names.
depart_idx = outputs[0].argmax(dim=1).cpu().numpy()
arrival_idx = outputs[1].argmax(dim=1).cpu().numpy()
depart = le_depart.inverse_transform(depart_idx)
arrival = le_arrival.inverse_transform(arrival_idx)
print(f"Depart: {depart[0]}, Arrival: {arrival[0]}")

Je souhaite aller de Aixe-sur-Vienne à Aire-sur-la-Lys.
Depart: Langeac, Arrival: Flez-Cuzy-Tannay


In [380]:
# Number of passes over the training data, and the per-epoch metric dicts
# collected along the way.
epochs = 5
history = []

for epoch in range(epochs):
    # Epoch numbers shown in the progress bar are 1-based.
    results = train_epoch(
        model=model,
        data_loader_train=train_loader,
        data_loader_valid=val_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device='cpu',
        current_epoch=epoch + 1,
    )
    print(results)
    history.append(results)

Epoch 1: 100%|████████████████████████████████████| 100/100 [01:11<00:00,  1.41batch/s]


{'train_acc': 0.045, 'train_loss': 7.21910728931427, 'valid_acc': 0.05, 'valid_loss': 7.1351673126220705}


Epoch 2: 100%|████████████████████████████████████| 100/100 [01:16<00:00,  1.31batch/s]


{'train_acc': 0.045, 'train_loss': 7.200993709564209, 'valid_acc': 0.075, 'valid_loss': 7.3102757930755615}


Epoch 3: 100%|████████████████████████████████████| 100/100 [01:11<00:00,  1.41batch/s]


{'train_acc': 0.11, 'train_loss': 7.027466611862183, 'valid_acc': 0.325, 'valid_loss': 7.0269640445709225}


Epoch 4: 100%|████████████████████████████████████| 100/100 [01:09<00:00,  1.43batch/s]


{'train_acc': 0.305, 'train_loss': 6.333118481636047, 'valid_acc': 0.4, 'valid_loss': 6.4460768699646}


Epoch 5: 100%|████████████████████████████████████| 100/100 [01:10<00:00,  1.43batch/s]


{'train_acc': 0.48, 'train_loss': 5.544565706253052, 'valid_acc': 0.5, 'valid_loss': 5.670116710662842}


In [401]:
# Probe sentence with a departure label, a hard-coded destination ("Byans")
# and an intermediate stop taken from the arrival labels.
sentence = f"Je souhaite aller de la gare de {le_depart.classes_[1]} à la gare de Byans en passant par {le_arrival.classes_[2]}."
# Tokenize the sentence into PyTorch tensors.
inputs = tokenizer(sentence, return_tensors="pt")

# Extract the two tensors the model's forward() expects.
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

print(sentence)

# Do not rely on an earlier cell having left the model in eval mode: make it
# explicit so dropout is disabled regardless of cell execution order.
model.eval()

with torch.no_grad():  # Disable gradient tracking to save memory
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Take the highest-scoring class of each head and hand plain numpy index
# arrays to the label encoders to map them back to station names.
depart_idx = outputs[0].argmax(dim=1).cpu().numpy()
arrival_idx = outputs[1].argmax(dim=1).cpu().numpy()
depart = le_depart.inverse_transform(depart_idx)
arrival = le_arrival.inverse_transform(arrival_idx)
print(f"Depart: {depart[0]}, Arrival: {arrival[0]}")

Je souhaite aller de la gare de Ambérieu à la gare de Byans en passant par Chamelet.
Depart: Bischheim, Arrival: Byans


In [237]:
# Replace both classification heads with freshly initialised layers sized to
# the current label encoders, keeping them on the same device as the encoder.
head_device = next(model.bert.parameters()).device
hidden_size = model.bert.config.hidden_size
model.out_arrival = nn.Linear(hidden_size, len(le_arrival.classes_)).to(head_device)
model.out_depart = nn.Linear(hidden_size, len(le_depart.classes_)).to(head_device)

# An optimizer built before this cell still references the parameters of the
# replaced heads, so the new heads would never be updated by it. Rebuild it
# over the model's current parameters (this also discards the previous Adam
# state).
optimizer = optim.Adam(model.parameters(), lr=2e-5)