In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer

In [None]:
# Standard-library helpers used to configure the environment and the import path.
import sys
import os
# Set the tokenizers parallelism flag before any tokenizer is created or used.
# NOTE(review): presumably this avoids the fork warning raised when DataLoader
# workers are spawned after a tokenizer has been used - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Make the local `src` directory (resolved from the current working directory)
# importable, so the `model_2` package below can be found.
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(os.path.abspath(src_dir))

from importlib import reload

# Import the project modules as module objects so they can be reloaded below.
import model_2.data_type.enum
import model_2.utils.data_loader

# Re-execute the project modules so that edits made on disk are picked up
# without restarting the kernel.
reload(model_2.data_type.enum)
reload(model_2.utils.data_loader)

# Bind the names used by the rest of the notebook only after the reload, so they
# refer to the freshly executed definitions.
from model_2.data_type.enum import CityType 
from model_2.utils.data_loader import DataEncoder

In [3]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '../data/dataset/model2_train.csv',
        '../data/dataset/model2_valid.csv',
        '../data/dataset/model2_test.csv',
    )
)

In [4]:
# Training hyper-parameters.
batch_size = 30
epochs = 2
learning_rate = 2e-5

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and encoder are both loaded from the same pretrained French BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
bert_model = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased")
# Maximum sequence length handed to the dataset wrapper when encoding sentences.
max_len = 128



In [5]:
# Encode the textual class labels as integer ids.
le_statuses = LabelEncoder()

# Fit on the enum's member names rather than on the data, so the label <-> id
# mapping does not depend on which labels happen to appear in a given split.
classes = CityType._member_names_
le_statuses.fit(classes)

# The 'class' column is overwritten in place. Without the dtype guard, running
# this cell a second time would feed the already-encoded integers back into
# `transform`, which rejects them as unseen labels. Skipping integer columns
# makes the cell safe to re-run.
for split_df in (train_df, val_df, test_df):
    if not pd.api.types.is_integer_dtype(split_df['class']):
        split_df['class'] = le_statuses.transform(split_df['class'])

In [6]:
# Wrap the train and validation frames in the project's dataset class.
train_dataset, val_dataset = (
    DataEncoder(split["sentence"], split["class"], tokenizer, max_len)
    for split in (train_df, val_df)
)

# Training batches are shuffled and fetched by 4 worker processes.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    num_workers=4,
    batch_size=batch_size,
    pin_memory=True,
)
# Validation batches keep the dataset order and are loaded in the main process.
val_loader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=batch_size,
    pin_memory=True,
)

In [7]:
class BertForSequenceClassification(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (size of the last dimension of the
            returned logits).
        encoder: Encoder module to wrap. It must expose ``config.hidden_size``
            and, when called with ``input_ids`` and ``attention_mask``, return
            an output whose element at index 1 is the pooled representation.
            Defaults to the notebook-level ``bert_model``.
        dropout: Dropout probability applied to the pooled representation
            before the linear head.
    """

    def __init__(self, n_classes, encoder=None, dropout=0.3):
        super().__init__()
        # The attribute names (`bert`, `drop`, `out_linear`) determine the
        # state_dict keys; they are kept stable so existing checkpoints load.
        self.bert = bert_model if encoder is None else encoder
        self.drop = nn.Dropout(p=dropout)
        self.out_linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch of token ids.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropped-out) pooled
            encoder output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is used as the sentence-level
        # representation (the pooled output for a Hugging Face BertModel).
        pooled_output = outputs[1]
        return self.out_linear(self.drop(pooled_output))

In [8]:
# One output unit per label known to the encoder.
model = BertForSequenceClassification(len(le_statuses.classes_))
# Deserialize onto the CPU first so that a checkpoint saved from a GPU session
# can be restored on any machine; the model is moved to `device` right after.
# `weights_only=True` restricts unpickling to tensors/containers, which is all a
# state_dict needs, and avoids executing arbitrary pickled code.
model.load_state_dict(
    torch.load("./processed/model_2/model.pth", map_location="cpu", weights_only=True)
)
model = model.to(device)
# Only parameters that still require gradients are handed to the optimizer.
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate
)
loss_fn = nn.CrossEntropyLoss().to(device)

  model.load_state_dict(torch.load("./processed/model_2/model.pth"))


In [9]:
def train_epoch(model, data_loader_train, data_loader_valid, loss_fn, optimizer, device, current_epoch):
    """Train ``model`` for one epoch, then evaluate it on the validation loader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader_train: Iterable of batches for training. Each batch is a
            mapping with the keys ``"input_ids"``, ``"attention_mask"`` and
            ``"class_name"``; the loader must also expose ``.dataset``.
        data_loader_valid: Same structure, used for evaluation only.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer updating the model's parameters.
        device: Device every batch tensor is moved to.
        current_epoch: Epoch number shown in the progress-bar description.

    Returns:
        dict with ``train_acc`` / ``valid_acc`` (correct predictions divided by
        ``len(loader.dataset)``) and ``train_loss`` / ``valid_loss`` (mean of
        the per-batch losses).

    The model is left in evaluation mode when the function returns.
    """
    model = model.train()
    losses_train = []
    losses_valid = []
    correct_predictions_train = 0
    correct_predictions_valid = 0

    with tqdm(total=len(data_loader_train), desc=f"Epoch {current_epoch}", unit="batch") as pbar:
        for d in data_loader_train:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["class_name"].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs, labels)

            correct_predictions_train += (outputs.argmax(1) == labels).sum().item()
            losses_train.append(loss.item())

            loss.backward()
            optimizer.step()

            pbar.update(1)

    model = model.eval()

    # No parameter update happens during validation, so autograd is disabled:
    # this avoids building a computation graph for every validation batch.
    with torch.no_grad():
        for d in data_loader_valid:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["class_name"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs, labels)

            correct_predictions_valid += (outputs.argmax(1) == labels).sum().item()
            losses_valid.append(loss.item())

    train_acc = correct_predictions_train / len(data_loader_train.dataset)
    train_loss = np.mean(losses_train)

    valid_acc = correct_predictions_valid / len(data_loader_valid.dataset)
    valid_loss = np.mean(losses_valid)

    return {"train_acc": train_acc, "train_loss": train_loss, "valid_acc": valid_acc, "valid_loss": valid_loss}

In [11]:
# Per-epoch metric dictionaries, in chronological order.
history = []

for epoch in range(epochs):
    # Epochs are reported 1-based in the progress bar.
    results = train_epoch(
        model=model,
        data_loader_train=train_loader,
        data_loader_valid=val_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        current_epoch=epoch + 1,
    )
    history.append(results)
    print(results)

Epoch 1:   0%|          | 0/2857 [00:00<?, ?batch/s]

{'train_acc': 0.9818175452519052, 'train_loss': 0.050076003502920324, 'valid_acc': 0.9999455367354719, 'valid_loss': 0.0006825513058339381}


Epoch 2:   0%|          | 0/2857 [00:00<?, ?batch/s]

{'train_acc': 0.9991714028965888, 'train_loss': 0.003452826998613531, 'valid_acc': 0.9998910734709439, 'valid_loss': 0.0005522556844230604}


In [12]:
# Persist the fine-tuned weights, then the per-epoch metrics as a CSV table.
torch.save(obj=model.state_dict(), f="./processed/model_2/model.pth")
results_df = pd.DataFrame(data=history)
results_df.to_csv(path_or_buf="./processed/model_2/model.csv")

In [9]:
# The inference cells below feed CPU tensors, so the model is moved to the CPU.
# It is also switched to evaluation mode: when the training cells are skipped,
# nothing else disables the classification head's dropout, and predictions
# would otherwise be randomly perturbed.
model.to("cpu").eval()
# model.load_state_dict(torch.load("./processed/model_2/model.pth"))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [40]:
# The city whose role is being asked about is wrapped in [[...]].
sentence = "Je souhaiterais aller de villars les blamont à les villedieu en evitant de paser par [[chauny]], ou chezy en orxois et chivy les etouvelles."

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")

# Retrieve the input_ids and attention_mask
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Evaluation mode disables dropout so the prediction is deterministic;
# no_grad skips autograd bookkeeping, which inference does not need.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Map the highest-scoring class index back to its label.
status = le_statuses.inverse_transform(torch.max(outputs, 1).indices)

print(sentence)
print(f"Status: {status[0]}")

Je souhaiterais aller de villars les blamont à les villedieu en evitant de paser par [[chauny]], ou chezy en orxois et chivy les etouvelles.
Status: intermediary


In [10]:
def make_inference(model, sentence, tokenizer, label_encoder):
    """Predict the label of a single sentence.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class scores per input sentence.
        sentence: Text to classify.
        tokenizer: Callable ``tokenizer(sentence, return_tensors="pt")``
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to labels.

    Returns:
        The decoded label of the highest-scoring class.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Run on whichever device the model's parameters live on, so the function
    # works for both CPU and GPU models. A parameter-less model gets CPU inputs.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Remember each sub-module's mode so it can be restored exactly afterwards
    # (a plain `model.train()` would also flip sub-modules that were in eval mode).
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()  # disable dropout: predictions must be deterministic
    try:
        with torch.no_grad():  # inference needs no autograd graph
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
        for module, mode in previous_modes:
            module.training = mode

    # Hand a host-side array to the label encoder, whatever the model's device.
    predicted = torch.max(outputs, 1).indices.cpu().numpy()
    return label_encoder.inverse_transform(predicted)[0]

In [11]:
def get_pred_from_sentence(sentence, model, tokenizer, label_encoder, cities):
    """Predict the role of each city mentioned in ``sentence``.

    For every city, a copy of the sentence is built in which that city is
    wrapped in ``[[...]]``, and the copy is classified with ``make_inference``.
    The input sentence itself is never modified.

    Args:
        sentence: Sentence mentioning the cities.
        model: Model forwarded to ``make_inference``.
        tokenizer: Tokenizer forwarded to ``make_inference``.
        label_encoder: Label encoder forwarded to ``make_inference``.
        cities: Iterable of city names to classify, in the order to report.

    Returns:
        A list with one ``{"status": <predicted label>, "city": <city>}``
        dictionary per city, in the order of ``cities``.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    response = []

    for city in cities:
        # Only mark whole-word occurrences: a plain substring replacement would
        # also tag a short name inside a longer one ("Paris" in "Parisot").
        pattern = rf"(?<!\w){re.escape(city)}(?!\w)"
        # A function replacement keeps the city text literal (no backslash or
        # group-reference interpretation in the replacement).
        marked_sentence = re.sub(pattern, lambda match: f"[[{match.group(0)}]]", sentence)

        output = make_inference(
            model,
            marked_sentence,
            tokenizer,
            label_encoder
        )

        response.append(
            {
                "status": output,
                "city": city
            }
        )

    return response

In [12]:
# Six place names used to fill the example request below.
ville_1, ville_2, ville_3 = "abergement le petit", "viodos abense de bas", "Marseille"
ville_4, ville_5, ville_6 = "chauny", "bucy le long", "bourguignon sous coucy"
cities = [ville_1, ville_2, ville_3, ville_4, ville_5, ville_6]

# `sentence` is what the model receives; `sentence_display` is the same text
# with a line break inserted so the printed version is easier to read.
sentence = f"Je veux passer par {ville_5}. Je veux aller à {ville_1} depuis {ville_2} en passant par {ville_3}, mais je veux éviter {ville_4}. Je voudrais aussi passer par {ville_6}."
sentence_display = f"Je veux passer par {ville_5}. Je veux aller à {ville_1} depuis {ville_2} en passant par {ville_3}, \nmais je veux éviter {ville_4}. Je voudrais aussi passer par {ville_6}."

print(sentence_display)
get_pred_from_sentence(
    sentence=sentence,
    model=model,
    tokenizer=tokenizer,
    label_encoder=le_statuses,
    cities=cities,
)


Je veux passer par bucy le long. Je veux aller à abergement le petit depuis viodos abense de bas en passant par Marseille, 
mais je veux éviter chauny. Je voudrais aussi passer par bourguignon sous coucy.


[{'status': 'arrival', 'city': 'abergement le petit'},
 {'status': 'departure', 'city': 'viodos abense de bas'},
 {'status': 'intermediary', 'city': 'Marseille'},
 {'status': 'none', 'city': 'chauny'},
 {'status': 'intermediary', 'city': 'bucy le long'},
 {'status': 'intermediary', 'city': 'bourguignon sous coucy'}]

In [14]:
# Second example: a departure, an arrival and one stop-over.
ville_1, ville_2, ville_3 = "Bosmont sur Serre", "Brancourt en Laonnois", "Chatillon sur Oise"
cities = [ville_1, ville_2, ville_3]
sentence = f"Je souhaite partir de {ville_1} à {ville_2} en passant par {ville_3}."

print(sentence)
get_pred_from_sentence(
    sentence=sentence,
    model=model,
    tokenizer=tokenizer,
    label_encoder=le_statuses,
    cities=cities,
)



Je souhaite partir de Bosmont sur Serre à Brancourt en Laonnois en passant par Chatillon sur Oise.


[{'status': 'departure', 'city': 'Bosmont sur Serre'},
 {'status': 'arrival', 'city': 'Brancourt en Laonnois'},
 {'status': 'intermediary', 'city': 'Chatillon sur Oise'}]