In [None]:
#!pip3 install --user datasets

In [None]:
#!pip3 install --user transformers -U

In [1]:
import os

import torch
import torch.nn as nn

from datasets import load_dataset

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

In [2]:
# Notebook-wide settings: the batch size used for tokenisation and the data
# loaders, the number of target classes, the number of training epochs, and
# the mapping from label id to human-readable category name.
config = dict(
    BATCH_SIZE=32,
    NUM_LABEL=3,
    NUM_EPOCH=10,
    CATEGORY={0: "contradiction", 1: "neutral", 2: "entailment"},
)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the SNLI dataset through the Hugging Face `datasets` library.
snli = load_dataset("snli")
# Drop every example labelled -1: config["CATEGORY"] only maps labels 0, 1 and 2.
# NOTE(review): -1 presumably marks examples without a gold label — confirm against the dataset card.
snli = snli.filter(lambda example: example['label'] != -1)

W0408 16:55:30.316817 140594607580992 builder.py:512] Reusing dataset snli (/usr/users/gpusdi1/gpusdi1_12/.cache/huggingface/datasets/snli/plain_text/1.0.0/bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5b537c)
W0408 16:55:30.337524 140594607580992 arrow_dataset.py:1636] Loading cached processed dataset at /usr/users/gpusdi1/gpusdi1_12/.cache/huggingface/datasets/snli/plain_text/1.0.0/bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5b537c/cache-3d6b52b60d37ce7c.arrow
W0408 16:55:30.344197 140594607580992 arrow_dataset.py:1636] Loading cached processed dataset at /usr/users/gpusdi1/gpusdi1_12/.cache/huggingface/datasets/snli/plain_text/1.0.0/bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5b537c/cache-4d19561d1c41d2b0.arrow
W0408 16:55:30.356946 140594607580992 arrow_dataset.py:1636] Loading cached processed dataset at /usr/users/gpusdi1/gpusdi1_12/.cache/huggingface/datasets/snli/plain_text/1.0.0/bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5

In [3]:
# Tokenizer loaded from the same 'distilbert-base-cased' checkpoint name as the model used later.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [6]:
def preprocess(dataset, tokenizer):
    """Tokenize a dataset of hypothesis/premise pairs and expose it as torch tensors.

    Args:
        dataset: object supporting ``map``, ``set_format`` and ``rename_column`` and
            indexable by the split names "train", "validation" and "test"; its
            examples must provide "hypothesis", "premise" and "label".
        tokenizer: callable applied to batches of (hypothesis, premise) pairs.

    Returns:
        A ``(train, validation, test)`` tuple of the processed splits, whose
        formatted columns are ``input_ids``, ``attention_mask`` and ``labels``.

    Note:
        Reads the module-level ``config["BATCH_SIZE"]`` for the mapping batch size.
    """
    
    def encode(data):
        # The hypothesis is passed as the first segment and the premise as the second.
        # Sequences are truncated and padded with the 'max_length' strategy.
        return tokenizer(data["hypothesis"], data["premise"], truncation=True, padding='max_length')
    
    # Tokenize in batches, using as many worker processes as os.cpu_count() reports.
    dataset_pre_processed = dataset.map(encode, batched=True, batch_size=config["BATCH_SIZE"], num_proc=os.cpu_count())
    dataset_pre_processed.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label']) # The model needs tensors as inputs
    dataset_pre_processed = dataset_pre_processed.rename_column("label", "labels") # The model expects the argument `labels`, not `label`
    
    return dataset_pre_processed["train"], dataset_pre_processed["validation"], dataset_pre_processed["test"] 

In [None]:
# Tokenize the filtered SNLI dataset and unpack its train / validation / test splits.
train_data, validation_data, test_data = preprocess(dataset=snli, tokenizer=tokenizer)

In [None]:
# Shuffle the training split only. train() stops after a fixed number of
# batches, so without shuffling every epoch would revisit exactly the same
# leading slice of the training set, in the same order.
train_data_loader = torch.utils.data.DataLoader(dataset=train_data,
                                                batch_size=config["BATCH_SIZE"],
                                                shuffle=True)
# The evaluation loaders keep a fixed order: the averaged loss does not depend
# on it, and a stable order keeps evaluation runs comparable.
validation_data_loader = torch.utils.data.DataLoader(dataset=validation_data,
                                                     batch_size=config["BATCH_SIZE"],
                                                     shuffle=False)
test_data_loader = torch.utils.data.DataLoader(dataset=test_data,
                                               batch_size=config["BATCH_SIZE"],
                                               shuffle=False)

In [None]:
def train(data_loader, model, optimizer, scheduler, device, verbose=False, max_batches=1001):
    """Run one (possibly truncated) training pass and return the mean per-example loss.

    Args:
        data_loader: iterable of batches. Each batch is indexed with the keys
            "input_ids", "attention_mask" and "labels", and each value is moved
            to ``device`` with ``.to``.
        model: called as ``model(input_ids, attention_mask, labels=labels)``; the
            result must expose a ``.loss`` tensor.
        optimizer: optimizer stepped once per processed batch.
        scheduler: kept for interface compatibility; it is not used inside this
            function.
        device: device the batch tensors are moved to.
        verbose: when true, print a progress line every 100 batches.
        max_batches: maximum number of batches processed in this call, or ``None``
            to consume the whole loader. The default of 1001 reproduces the
            previous hard-coded cut-off (stop after batch index 1000).

    Returns:
        The loss averaged over all processed examples (each batch loss is
        weighted by its batch size).

    Raises:
        ValueError: if ``max_batches`` is given and smaller than 1.
    """
    if max_batches is not None and max_batches < 1:
        raise ValueError("max_batches must be a positive integer or None")

    model.train()
    total_number, total_loss = 0, 0.0
    for i, data in enumerate(data_loader):
        if verbose and i % 100 == 0:
            print(f'\t[Training] Progress: {i}/{len(data_loader)}')
        inputs = {
            "input_ids": data["input_ids"].to(device),
            "labels": data["labels"].to(device),
            "attention_mask": data["attention_mask"].to(device)
        }

        # Clear gradients *before* the backward pass so that anything left over
        # from an earlier, interrupted run cannot leak into this update.
        optimizer.zero_grad()
        outputs = model(inputs["input_ids"], inputs["attention_mask"], labels=inputs["labels"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_size = inputs["input_ids"].shape[0]
        total_number += batch_size
        total_loss += batch_size * loss.item()

        # Stop after the last allowed batch without pulling an extra one from the loader.
        if max_batches is not None and i + 1 >= max_batches:
            break

    return total_loss / total_number

In [None]:
def test(data_loader, model, device, verbose=False):
    model.eval()
    total_number, total_loss = 0, 0.0
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            if verbose and i%100==0:
                print(f'\t[Testing] Progress: {i}/{len(data_loader)}')
            inputs = {
                "input_ids": data["input_ids"].to(device),
                "labels": data["labels"].to(device),
                "attention_mask": data["attention_mask"].to(device)
            }
            outputs = model(inputs["input_ids"], inputs["attention_mask"], labels=inputs["labels"])
            loss = outputs.loss
            total_number += inputs["input_ids"].shape[0]
            total_loss += inputs["input_ids"].shape[0] * loss.item()

    return total_loss/total_number

In [None]:
def train_test(model, train_data_loader, validation_data_loader, test_data_loader, num_epoch, optimizer, scheduler, device, verbose=False):
    """Train for ``num_epoch`` epochs, validating after each, then report the test loss.

    For every epoch the function prints the epoch number, runs ``train`` on the
    training loader, runs ``test`` on the validation loader, prints both losses
    and advances ``scheduler`` by one step. After the last epoch it evaluates
    once on the test loader and prints that loss. Nothing is returned.
    """
    for epoch in range(num_epoch):
        print(f'[Epoch] Epoch number: {epoch}')

        train_loss = train(
            data_loader=train_data_loader,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device,
            verbose=verbose,
        )
        print(f'[Train loss] Train Loss: {train_loss}')

        validation_loss = test(
            data_loader=validation_data_loader,
            model=model,
            device=device,
            verbose=verbose,
        )
        print(f'[Validation loss] Validation Loss: {validation_loss}')

        # The learning-rate schedule advances once per epoch.
        scheduler.step()

    test_loss = test(data_loader=test_data_loader, model=model, device=device, verbose=verbose)
    print(f'[Test loss] Test Loss: {test_loss}')

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=config["NUM_LABEL"])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# train_test() calls scheduler.step() once per epoch, so the schedule is
# expressed in epochs. The previous arguments (5 warmup steps, 2 training
# steps) set the learning rate to 0 for the whole first epoch and again from
# the sixth epoch onward. With no warmup the rate starts at its full value
# and decays linearly towards 0 over config["NUM_EPOCH"] epochs.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=config["NUM_EPOCH"],
)

In [None]:
# Move the whole model to the selected device, then freeze the pretrained
# backbone so that only the parameters outside `base_model` keep receiving gradients.
model = model.to(device)
for backbone_param in model.base_model.parameters():
    backbone_param.requires_grad_(False)

In [None]:
# Train for config["NUM_EPOCH"] epochs with progress output, validating after each epoch and testing at the end.
train_test(model, train_data_loader, validation_data_loader, test_data_loader, config["NUM_EPOCH"], optimizer, scheduler, device, verbose=True)

In [None]:
def get_prediction(model, tokenizer, hypothesis, premise, device="cpu", categories=None):
    """Classify one hypothesis/premise pair and return the class probabilities.

    The pair is tokenized in the same order as during preprocessing
    (hypothesis first, premise second), passed through the model in
    evaluation mode without gradient tracking, and the logits are turned into
    probabilities with a softmax over the class dimension. The predicted
    category and its probability are also printed.

    Args:
        model: sequence-classification model whose output exposes ``.logits``
            with one row per input pair.
        tokenizer: callable returning "input_ids" and "attention_mask" tensors
            when invoked with ``return_tensors="pt"``.
        hypothesis: hypothesis sentence.
        premise: premise sentence.
        device: device the input tensors are moved to (defaults to the CPU).
            The model is expected to already live on that device.
        categories: optional mapping from class index to category name;
            defaults to the notebook-level ``config["CATEGORY"]``.

    Returns:
        A 1-D tensor holding one probability per category.
    """
    if categories is None:
        categories = config["CATEGORY"]

    # return_tensors="pt" yields batched tensors; no labels are needed (or
    # available) at inference time, and a single pair needs no padding.
    encoded = tokenizer(hypothesis, premise, truncation=True, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
        # Normalise over the class dimension (the last one), then drop the
        # batch dimension of size one.
        probabilities = torch.softmax(outputs.logits, dim=-1)[0]

    # Convert to plain Python values so the index can be used as a dict key.
    index_max = int(torch.argmax(probabilities).item())
    prediction = probabilities[index_max].item()

    print(f'{hypothesis} and {premise} are: {categories[index_max]} with a probability of {prediction}')

    return probabilities

## Example

In [None]:
# NOTE(review): "MODEL_PATH" looks like a placeholder — point it at a saved model file before running.
# torch.load unpickles the file, so only load checkpoints that come from a trusted source.
model = torch.load("MODEL_PATH", map_location=device)
model.to(device)

In [None]:
# Example sentence pair; get_prediction prints the predicted category and returns the class probabilities.
hypothesis = "A soccer game with multiple males playing."
premise = "Some men are playing a sport."
prob = get_prediction(model, tokenizer, hypothesis, premise, device=device)

## VizViz: attention visualization with BertViz

In [4]:
!pip3 install bertviz --user

Collecting bertviz
[?25l  Downloading https://files.pythonhosted.org/packages/15/8b/f4226c75b35df80504ef41399fc1569b550332e3e4796618e5669c91af55/bertviz-1.0.0-py3-none-any.whl (162kB)
[K     |################################| 163kB 6.0MB/s eta 0:00:01
[?25hCollecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/9f/2d/f094ea90db0ede94ed85cf843da694e18343feed686241af86743f583b00/boto3-1.17.47-py2.py3-none-any.whl (131kB)
[K     |################################| 133kB 15.4MB/s eta 0:00:01
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |################################| 1.2MB 13.7MB/s eta 0:00:01
Collecting s3transfer<0.4.0,>=0.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/98/14/0b4be62b65c52d6d1c442f24e02d2a9889a73d3c352002e14c70f84a679f/s3transfer-0.3.6-py2.py3-none-any

In [4]:
from bertviz import head_view

In [5]:
#model = torch.load("./models/best_model_aug_V2.pt", map_location=device) # <- Does not work because that model does not output attentions
# Rebuild the model from the pretrained checkpoint with attention outputs enabled.
# NOTE: as the loading warning below states, the classification head weights are newly initialized here.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=config["NUM_LABEL"], output_attentions=True)
model.to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [10]:
def viz_viz(hypothesis, premise, model, tokenizer, device):
    """Display the BertViz head view of the model's attention for a sentence pair.

    Args:
        hypothesis: first sentence passed to the tokenizer.
        premise: second sentence passed to the tokenizer.
        model: model whose output exposes ``attentions`` (for example one
            created with ``output_attentions=True``).
        tokenizer: tokenizer used both to encode the pair and to map the ids
            back to token strings for display.
        device: device the encoded inputs are moved to; the model is expected
            to already live there.

    Raises:
        ValueError: if the model output does not contain attentions.
    """
    encoded = tokenizer(hypothesis, premise, return_tensors="pt", add_special_tokens=True)
    # Recover the token strings of the single encoded pair for the view labels.
    input_token = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
    encoded = encoded.to(device)

    # Visualisation needs neither dropout noise nor gradient tracking.
    model.eval()
    with torch.no_grad():
        output_model = model(**encoded)

    # An explicit check replaces the former assert plus an unreachable
    # `except KeyError` branch that referenced an undefined `logger`.
    attention = getattr(output_model, "attentions", None)
    if attention is None:
        raise ValueError("Model should output attention")

    head_view(attention, input_token)

In [11]:
# Sentence pair used for the attention visualization in the next cell.
hypothesis = "Model should output attention"
premise = "Model must ouput attentions"

In [12]:
# Render the attention head view for the pair defined above.
viz_viz(hypothesis, premise, model, tokenizer, device)

<IPython.core.display.Javascript object>