In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer

In [2]:
import sys
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(os.path.abspath(src_dir))

from importlib import reload

import model_2.utils.data_loader

reload(model_2.utils.data_loader)

from model_2.utils.data_loader import DataEncoderNER



In [3]:
train_df = pd.read_csv('../data/dataset/ner_train.csv', sep=',')
val_df = pd.read_csv('../data/dataset/ner_valid.csv', sep=',')
test_df = pd.read_csv('../data/dataset/ner_test.csv', sep=',')

In [57]:
batch_size = 30
epochs = 1
learning_rate = 2e-5

device = "cuda"

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
bert_model = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased")
max_len = 128

In [5]:
le_sub_sentences = LabelEncoder()

classes = ["I", "O", "B"]
le_sub_sentences.fit(classes)

train_df['status'] = le_sub_sentences.transform(train_df['status'])
val_df['status'] = le_sub_sentences.transform(val_df['status'])
test_df['status'] = le_sub_sentences.transform(test_df['status'])

In [6]:
train_dataset = DataEncoderNER(train_df["sub_sentence"], train_df["status"], tokenizer, max_len)
val_dataset = DataEncoderNER(val_df["sub_sentence"], val_df["status"], tokenizer, max_len)

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

In [7]:
class BertForSequenceClassification(nn.Module):
    def __init__(self, n_classes):
        super(BertForSequenceClassification, self).__init__()
        self.bert = bert_model
        self.drop = nn.Dropout(p=0.3)
        self.out_linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs[1]

        output_depart = self.out_linear(self.drop(pooled_output))

        return output_depart

In [58]:
model = BertForSequenceClassification(len(le_sub_sentences.classes_))
model.load_state_dict(torch.load("./processed/model_2/NER_model.pth"))
model = model.to(device)
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate
)
loss_fn = nn.CrossEntropyLoss().to(device)

  model.load_state_dict(torch.load("./processed/model_2/NER_model.pth"))


In [59]:
def train_epoch(model, data_loader_train, data_loader_valid, loss_fn, optimizer, device, current_epoch):
    model = model.train()
    losses_train = []
    losses_valid = []
    correct_predictions_train = 0
    correct_predictions_valid = 0

    with tqdm(total=len(data_loader_train), desc=f"Epoch {current_epoch}", unit="batch") as pbar:
        for d in data_loader_train:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["status"].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs, labels)

            correct_predictions_train += (outputs.argmax(1) == labels).sum().item()
            losses_train.append(loss.item())

            loss.backward()
            optimizer.step()

            pbar.update(1)

    model = model.eval()

    for d in data_loader_valid:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels= d["status"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = loss_fn(outputs, labels)

        correct_predictions_valid += (outputs.argmax(1) == labels).sum().item()
        losses_valid.append(loss.item())

    train_acc = correct_predictions_train / len(data_loader_train.dataset)
    train_loss = np.mean(losses_train)

    valid_acc = correct_predictions_valid / len(data_loader_valid.dataset)
    valid_loss = np.mean(losses_valid)

    return {"train_acc": train_acc, "train_loss": train_loss, "valid_acc": valid_acc, "valid_loss": valid_loss}

In [60]:
history = []

for epoch in range(epochs):
    results = train_epoch(
        model,
        train_loader,
        val_loader,
        loss_fn,
        optimizer,
        device,
        epoch + 1
    )
    print(results)
    history.append(results)

Epoch 1:   0%|          | 0/2474 [00:00<?, ?batch/s]

{'train_acc': 0.9990568325743081, 'train_loss': 0.003892944736765435, 'valid_acc': 0.9981136820925554, 'valid_loss': 0.005741413059782686}


In [70]:
torch.save(model.state_dict(), "./processed/model_2/NER_model.pth")
results_df = pd.DataFrame(history)
results_df.to_csv("./processed/model_2/NER_model.csv")

In [17]:
sentence = f"END[O] Paris Marseille"

# Tokenisez la phrase
inputs = tokenizer(sentence, return_tensors="pt")

# Récupérez les input_ids et attention_mask
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

outputs = model(input_ids=input_ids, attention_mask=attention_mask)

status = le_sub_sentences.inverse_transform(torch.max(outputs, 1).indices)

print(sentence)
print(f"Status: {status[0]}")

END[O] Paris Marseille
Status: O


In [76]:
def make_inference(model, sentence, tokenizer, label_encoder):
    inputs = tokenizer(sentence, return_tensors="pt")

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    status = label_encoder.inverse_transform(torch.max(outputs, 1).indices)
    return status[0]

In [73]:
import spacy
nlp = spacy.load("fr_core_news_sm")

In [77]:
def get_names_from_sentence(sentence, model, tokenizer, label_encoder):
    sentence = f"END {sentence.lower()} END"
    doc_sentence = nlp(sentence)
    tagged_sentence = ""

    for idx, word in enumerate([token_word.text for token_word in doc_sentence]):
        sub_sentence = doc_sentence[idx].text

        if(idx + 1 > 1):
            sub_sentence = f"{sub_sentence}[{status}] "
            tagged_sentence += f"{sub_sentence}"

        for word in doc_sentence[idx + 1 : idx + 7]:
            word_to_add = word.text
            if(word.text == "."):
                word_to_add = "None"
            sub_sentence = f"{sub_sentence} {word_to_add}"


        status = make_inference(
            model,
            sub_sentence,
            tokenizer,
            label_encoder
        )

    individual_words = tagged_sentence.split()

    names = []
    for idx, word in enumerate(individual_words):
        status = word[-3:]
        trimmed_word = word[:-3]
        if(status == "[B]"):
            names.append(trimmed_word)

        if(status == "[I]"):
            if(trimmed_word == "."):
                continue
            names[len(names) - 1] = f"{names[len(names) - 1]} {trimmed_word}"

    return names

In [78]:
ville_1 = "abergement le petit"
ville_2 = "viodos abense de bas"
sentence = f"Je souhaite aller d'{ville_1} à {ville_2} en passant par Marseille. Tu sais ou je dois aller ?" 

get_names_from_sentence(
    sentence,
    model,
    tokenizer,
    le_sub_sentences
)


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
# Je souhaite aller de [[Paris]] à [[Marseille]] en passant par [[Lyon]].