# Getting a prediction from a checkpoint

In [1]:
!pip install lightning
!pip install -U 'wandb>=0.12.10'
!pip install pytorch-lightning --quiet



In [2]:
# importing libraries
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import lightning as L
import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from torcheval.metrics import MultilabelAUPRC
# BERT_MODEL_NAME is otherwise only assigned in a later cell; define it here as
# well so this cell does not raise NameError on a fresh top-to-bottom run.
BERT_MODEL_NAME = "bert-base-multilingual-cased"
# Pretrained encoder shared as a module-level global: ToxicCommentTagger.forward
# and pipeline() both reference it by this name.
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

class ToxicCommentTagger(L.LightningModule):
    """Multi-label tagger: a sigmoid-activated linear head on BERT's pooled output.

    NOTE(review): the encoder is the module-level ``bert_model`` global, not a
    submodule of this class. Only ``classifier`` (and the parameter-free
    ``criterion``) are registered here, so ``self.parameters()`` -- and therefore
    the optimizer built in ``configure_optimizers`` -- only covers the linear
    head, and the caller has to move ``bert_model`` to the right device itself.
    Registering the encoder as a submodule would presumably change the
    checkpoint keys and break loading of existing checkpoints, so it is left
    untouched here -- confirm before changing.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        """Build the classification head.

        Args:
            n_classes: Number of output labels (size of the linear head).
            n_training_steps: Total number of scheduler steps; ``None`` disables
                the learning-rate schedule (see ``configure_optimizers``).
            n_warmup_steps: Number of warm-up steps for the linear schedule.
        """
        super().__init__()
        self.classifier = nn.Linear(bert_model.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)`` for a batch.

        ``probabilities`` are the sigmoid outputs of the linear head applied to
        the encoder's ``pooler_output``. ``loss`` is the BCE loss against
        ``labels`` when they are given, and the plain integer ``0`` otherwise.
        """
        encoded = bert_model(input_ids, attention_mask=attention_mask)
        probabilities = torch.sigmoid(self.classifier(encoded.pooler_output))
        loss = 0
        if labels is not None:
            loss = self.criterion(probabilities, labels)
        return loss, probabilities

    def _shared_step(self, batch):
        """Run the forward pass on a batch dict and return ``(loss, outputs, labels)``."""
        labels = batch["labels"]
        loss, outputs = self(batch["input_ids"], batch["attention_mask"], labels)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss; return loss, predictions and labels."""
        loss, outputs, labels = self._shared_step(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and one AUPRC value per class name."""
        loss, outputs, labels = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        # A fresh metric object is created for every batch, so each value logged
        # below is computed from this batch alone.
        # Relies on the module-level ``classes`` list for the label names.
        metric_auprc = MultilabelAUPRC(num_labels=len(classes), average=None)
        aupc = metric_auprc.update(outputs, labels).compute().tolist()
        for name_class, m in zip(classes, aupc):
            self.log(f"aupc_{name_class}", m, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss."""
        loss, _, _ = self._shared_step(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def on_train_epoch_end(self, *args):
        """Intentionally a no-op."""
        pass

    def configure_optimizers(self):
        """Return AdamW (lr=2e-5), plus a linear warm-up schedule when configured.

        When ``n_training_steps`` is ``None`` (e.g. the module was built with
        only ``n_classes``), the schedule cannot be constructed: its step
        function compares the current step against the step counts. In that case
        only the optimizer is returned instead of failing. A missing
        ``n_warmup_steps`` is treated as zero warm-up steps.
        """
        optimizer = AdamW(self.parameters(), lr=2e-5)

        if self.n_training_steps is None:
            return optimizer

        warmup_steps = 0 if self.n_warmup_steps is None else self.n_warmup_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=self.n_training_steps,
        )

        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step',  # step the schedule every batch, not every epoch
            ),
        )
     

In [6]:
# Label names. pipeline() pairs them positionally with the model outputs, so the
# order presumably has to match the one used at training time -- TODO confirm.
classes = [
    'Compression',
    'Explanation',
    'Modulation',
    'Omission',
    'Substitution',
    'Synonymy',
    'Syntactic',
    'Transcription',
    'Transposition',
]
BERT_MODEL_NAME = "bert-base-multilingual-cased"
# n_classes is passed explicitly as a constructor argument alongside the checkpoint.
model = ToxicCommentTagger.load_from_checkpoint(
    "/home/alex/CODE/genai-for-public-good/notebooks/tb_logs/my_model/version_2/checkpoints/epoch=9-step=690.ckpt",
    n_classes=len(classes),
)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# disable randomness, dropout, etc...
# The encoder is a module-level global rather than a submodule of `model`, so
# model.eval() does not reach it; switch it to eval mode explicitly.
bert_model.eval()
model.eval()

ToxicCommentTagger(
  (classifier): Linear(in_features=768, out_features=9, bias=True)
  (criterion): BCELoss()
)

In [15]:
def pipeline(text: str) -> dict:
    """Score a single text against every label in ``classes``.

    The text is tokenized to a fixed length of 512 tokens, run through the
    module-level ``model`` (which uses the global ``bert_model`` encoder), and
    the sigmoid outputs are paired positionally with the label names.

    Args:
        text: The raw text to classify.

    Returns:
        A dict mapping each label name to its predicted probability as a float.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # Without explicit truncation, inputs longer than max_length are passed
        # through at full length and overflow the encoder's position limit.
        truncation=True,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Run the encoder on the same device as the classifier head instead of a
    # hard-coded "cuda", so inputs, encoder and head always agree (and the
    # function also works on CPU-only machines).
    device = model.device
    bert_model.to(device)
    try:
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            _, probabilities = model(
                encoding["input_ids"].to(device),
                encoding["attention_mask"].to(device),
            )
    finally:
        # Always move the encoder back to the CPU, even if the forward pass fails.
        bert_model.to("cpu")
    scores = probabilities.flatten().cpu().numpy()
    return {label: float(score) for label, score in zip(classes, scores)}

# Sample French inputs for a quick manual check. The second assignment
# overrides the first, so only "Accompagnement individuel" is actually scored.
test_comment = "Une adhésion de 15 € / an à l'association est demandée lors de l'inscription."
test_comment = "Accompagnement individuel"

In [16]:
# Score the sample text; the notebook displays the returned {label: probability} dict.
pipeline(test_comment)

{'Compression': 0.2803936004638672,
 'Explanation': 0.3854008913040161,
 'Modulation': 0.30389881134033203,
 'Omission': 0.31148865818977356,
 'Substitution': 0.34294041991233826,
 'Synonymy': 0.24214085936546326,
 'Syntactic': 0.30812281370162964,
 'Transcription': 0.35744500160217285,
 'Transposition': 0.30052682757377625}