In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.backends.cudnn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

In [4]:
import sys
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(os.path.abspath(src_dir))

from importlib import reload

import src.data.dataset

reload(src.data.dataset)

from src.data.dataset import TrainDataset

In [6]:
train_station_df = pd.read_csv('../data/liste-des-gares.csv', sep=';')

departure_train_df = pd.read_csv('../data/dataset/departure_train.csv')
departure_valid_df = pd.read_csv('../data/dataset/departure_valid.csv')

arrival_train_df = pd.read_csv('../data/dataset/arrival_train.csv')
arrival_valid_df = pd.read_csv('../data/dataset/arrival_valid.csv')

In [7]:
train_df = departure_train_df
valid_df = departure_valid_df

batch_size = 30
epochs = 2
learning_rate = 2e-5

device = "cuda"

# tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
bert_model = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased")
max_len = 128

number_of_training_sentences_per_station = -1
number_of_training_sentences_per_station_valid = -1
number_of_train_stations = departure_train_df['departure'].nunique()

In [8]:
new_train_df = pd.DataFrame()
new_valid_df = pd.DataFrame()

with tqdm(total=number_of_train_stations) as pbar:
    for i in range(number_of_train_stations):
        if number_of_training_sentences_per_station != -1 :
            train_df_subset = train_df[train_df["departure_train_station_id"] == i]
            new_train_df = pd.concat([new_train_df, train_df_subset])
        
        if number_of_training_sentences_per_station_valid != -1:
            valid_df_subset = valid_df[valid_df["departure_train_station_id"] == i].head(number_of_training_sentences_per_station_valid)
            new_valid_df = pd.concat([new_valid_df, valid_df_subset])

        pbar.update(1)

if number_of_training_sentences_per_station != -1:
    train_df = new_train_df.reset_index(drop=True)
if number_of_training_sentences_per_station_valid != -1:
    valid_df = new_valid_df.reset_index(drop=True)

  0%|          | 0/3469 [00:00<?, ?it/s]

In [9]:
le_train_stations = LabelEncoder()

le_train_stations.fit(train_station_df["LIBELLE"].unique())

train_df['departure'] = le_train_stations.transform(train_df['departure'])
train_df['arrival'] = le_train_stations.transform(train_df['arrival'])

valid_df['departure'] = le_train_stations.transform(valid_df['departure'])
valid_df['arrival'] = le_train_stations.transform(valid_df['arrival'])

In [10]:
train_dataset = TrainDataset(
    train_df["sentence"], train_df["departure"], train_df["arrival"], tokenizer, max_len)
val_dataset = TrainDataset(
    valid_df["sentence"], valid_df["departure"], valid_df["arrival"], tokenizer, max_len)

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

In [11]:
class BertForSequenceClassification(nn.Module):
    def __init__(self, n_depart, n_arrival):
        super(BertForSequenceClassification, self).__init__()
        self.bert = bert_model
        self.drop = nn.Dropout(p=0.3)
        self.out_depart = nn.Linear(self.bert.config.hidden_size, n_depart)
        self.out_arrival = nn.Linear(self.bert.config.hidden_size, n_arrival)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs[1]

        output_depart = self.out_depart(self.drop(pooled_output))
        output_arrival = self.out_arrival(self.drop(pooled_output))

        return output_depart, output_arrival

    def freeze_arrival_layer(self):
        for param in self.out_arrival.parameters():
            param.requires_grad = False

    def unfreeze_arrival_layer(self):
        for param in self.out_arrival.parameters():
            param.requires_grad = True

    def freeze_depart_layer(self):
        for param in self.out_depart.parameters():
            param.requires_grad = False

    def unfreeze_depart_layer(self):
        for param in self.out_depart.parameters():
            param.requires_grad = True

In [13]:
model = BertForSequenceClassification(len(le_train_stations.classes_), len(le_train_stations.classes_))
model.load_state_dict(torch.load("./processed/departure_arrival_model2_trained.pth"))
model = model.to(device)

  model.load_state_dict(torch.load("./processed/departure_arrival_model2_trained.pth"))


In [14]:
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate
)
loss_fn = nn.CrossEntropyLoss().to(device)

def train_epoch(model, data_loader_train, data_loader_valid, loss_fn, optimizer, device, current_epoch):
    model = model.train()
    losses_train = []
    losses_valid = []
    correct_predictions_train = 0
    correct_predictions_valid = 0

    with tqdm(total=len(data_loader_train), desc=f"Epoch {current_epoch}", unit="batch") as pbar:
        for d in data_loader_train:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels_depart = d["departure"].to(device)
            labels_arrival = d["arrival"].to(device)

            optimizer.zero_grad()

            outputs_depart, outputs_arrival = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss_depart = loss_fn(outputs_depart, labels_depart)
            loss_arrival = loss_fn(outputs_arrival, labels_arrival)
            loss = loss_depart + loss_arrival

            correct_predictions_train += (outputs_depart.argmax(1) == labels_depart).sum().item()
            correct_predictions_train += (outputs_arrival.argmax(1) == labels_arrival).sum().item()

            losses_train.append(loss.item())

            loss.backward()
            optimizer.step()

            pbar.update(1)

    model = model.eval()

    for d in data_loader_valid:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels_depart = d["departure"].to(device)
        labels_arrival = d["arrival"].to(device)

        outputs_depart, outputs_arrival = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss_depart = loss_fn(outputs_depart, labels_depart)
        loss_arrival = loss_fn(outputs_arrival, labels_arrival)
        loss =  loss_depart + loss_arrival

        correct_predictions_valid += (outputs_depart.argmax(1) == labels_depart).sum().item()
        correct_predictions_valid += (outputs_arrival.argmax(1) == labels_arrival).sum().item()

        losses_valid.append(loss.item())

    train_acc = correct_predictions_train / (2 * len(data_loader_train.dataset))
    train_loss = np.mean(losses_train)

    valid_acc = correct_predictions_valid / (2 * len(data_loader_valid.dataset))
    valid_loss = np.mean(losses_valid)

    return {"train_acc": train_acc, "train_loss": train_loss, "valid_acc": valid_acc, "valid_loss": valid_loss}

In [15]:
history = []

for epoch in range(epochs):
    results = train_epoch(
        model,
        train_loader,
        val_loader,
        loss_fn,
        optimizer,
        device,
        epoch + 1
    )
    print(results)
    history.append(results)

Epoch 1:   0%|          | 0/17854 [00:00<?, ?batch/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [39]:
torch.save(model.state_dict(), "./model/processed/departure_arrival_model3_trained.pth")

In [19]:
results_df = pd.DataFrame(history)
results_df.to_csv("./model/processed/model3_departure_arrival_training_history.csv")

In [15]:
model = model.to("cuda")

In [16]:
model = model.to("cpu")

In [17]:
sentence = f"{le_train_stations.classes_[1]} {le_train_stations.classes_[60]}"
# Tokenisez la phrase
inputs = tokenizer(sentence, return_tensors="pt")

# Récupérez les input_ids et attention_mask
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

print(sentence)
outputs = model(input_ids=input_ids, attention_mask=attention_mask)

depart = le_train_stations.inverse_transform(torch.max(outputs[0], 1).indices)
arrival = le_train_stations.inverse_transform(torch.max(outputs[1], 1).indices)
print(f"Depart: {depart[0]}, Arrival: {arrival[0]}")

Abbaretz Amiens
Depart: Abbaretz, Arrival: Abbaretz
