In [None]:
!pip3 install --user datasets

In [None]:
!pip3 install --user transformers

In [None]:
from datasets import load_dataset
# Load the SNLI dataset and keep only the sentence pairs that carry a label:
# pairs without one are marked with -1 and are dropped here.
snli = load_dataset("snli").filter(lambda pair: pair["label"] != -1)

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [None]:
# Settings shared by the cells below.
config = {
    "BATCH_SIZE": 32,  # used for both the tokenization batches and the DataLoader batches
    "NUM_LABEL": 3,    # passed to the model as `num_labels`
}

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA (a hard-coded 'cuda' device only fails later, when the
# model or a batch is moved to it).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# The tokenizer and the model must be loaded from the SAME checkpoint. The cased
# and uncased DistilBERT checkpoints use different vocabularies, so ids produced
# by one tokenizer do not line up with the other model's embedding table.
# NOTE(review): the uncased checkpoint (the tokenizer's original choice) is used
# for both; switch this single constant if the cased model was intended.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=config["NUM_LABEL"],
)

In [None]:
def preprocess_dataset(dataset, tokenizer, batch_size=None):
    """Tokenize the sentence pairs of ``dataset`` and expose them as PyTorch tensors.

    Args:
        dataset: Object providing ``map``, ``rename_column`` and ``set_format``
            (a Hugging Face dataset split in this notebook) with the columns
            ``hypothesis``, ``premise`` and ``label``.
        tokenizer: Callable tokenizer; it receives the batch of hypotheses as
            first sequence and the batch of premises as second sequence.
        batch_size: Number of examples per tokenization batch. Defaults to
            ``config["BATCH_SIZE"]`` when omitted.

    Returns:
        The tokenized dataset, formatted to yield ``input_ids``,
        ``attention_mask`` and ``labels`` as torch tensors.
    """
    if batch_size is None:
        batch_size = config["BATCH_SIZE"]

    def tokenize_batch(batch):
        # padding="longest" pads every tokenization batch to its own longest
        # pair, so the sequence length may differ from one batch to the next.
        # truncation=True keeps over-long pairs within the tokenizer's maximum
        # length instead of producing inputs the model cannot accept.
        return tokenizer(
            batch["hypothesis"],
            batch["premise"],
            padding="longest",
            truncation=True,
        )

    dataset_tokenized = dataset.map(tokenize_batch, batched=True, batch_size=batch_size)
    # The model expects the keyword argument `labels`, not `label`.
    dataset_tokenized = dataset_tokenized.rename_column("label", "labels")
    # Select the output format AFTER renaming so it names the final column and
    # does not depend on rename_column carrying the format settings over.
    # We need PyTorch tensors as output.
    dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset_tokenized

In [None]:
# Pull the three splits out of the filtered SNLI dataset.
train_dataset, test_dataset, validation_dataset = (
    snli[split_name] for split_name in ("train", "test", "validation")
)

In [None]:
# Run every split through the same tokenization pipeline.
(
    train_dataset_tokenized,
    test_dataset_tokenized,
    validation_dataset_tokenized,
) = [
    preprocess_dataset(split, tokenizer)
    for split in (train_dataset, test_dataset, validation_dataset)
]

In [None]:
# The three loaders share the same settings: the configured batch size and no
# shuffling.
# NOTE(review): the splits were padded per tokenization batch of the same size,
# so the loader batches presumably need to line up with those batches to have a
# uniform sequence length -- confirm before enabling shuffling.
_loader_options = {"batch_size": config["BATCH_SIZE"], "shuffle": False}

train_loader = torch.utils.data.DataLoader(dataset=train_dataset_tokenized, **_loader_options)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset_tokenized, **_loader_options)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset_tokenized, **_loader_options)

In [None]:
# Peek at the first batch produced by the training loader.
first_train_batch = next(iter(train_loader))
print(first_train_batch)

In [None]:
# Move the model to the device selected earlier in the notebook. Left as a bare
# expression, so the notebook also displays the value returned by .to().
model.to(device)

In [None]:
# Smoke test: print the tensor shapes of the first test batch, run it through
# the model (labels included) and print what the model returns.
for data in test_loader:
    print(f'Shape input_ids: {data["input_ids"].shape}')
    print(f'Shape labels: {data["labels"].shape}')
    print(f'Shape attention masks: {data["attention_mask"].shape}')
    # Named `model_inputs` rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    model_inputs = {
        key: data[key].to(device)
        for key in ("input_ids", "labels", "attention_mask")
    }
    output = model(**model_inputs)
    print(output)
    break  # a single batch is enough for this check