In [32]:
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch import nn
from tqdm import tqdm
import numpy as np
import datasets
import torch
from sklearn.metrics import accuracy_score


In [23]:
class XNLI_Dataset(Dataset):
    """Pre-tokenized multilingual subset of XNLI.

    For every requested language the first ``n_samples`` rows of the chosen
    split are taken, the per-language subsets are concatenated and shuffled
    with a fixed seed, and premises / hypotheses are tokenized separately
    with the ``xlm-roberta-base`` tokenizer.

    Args:
        kind: Name of the split to load (e.g. ``'train'``, ``'validation'``).
        languages: Language codes passed to ``datasets.load_dataset``.
        n_samples: Maximum number of rows taken per language. Clamped to the
            size of the split so small splits do not raise.
        seed: Seed used when shuffling the concatenated dataset.
        max_length: Optional length to pad/truncate to. ``None`` lets the
            tokenizer fall back to its own maximum length.

    Each item is a ``(premise_encoding, hypothesis_encoding, label)`` tuple.
    The encodings are produced with ``return_tensors='pt'`` for a single
    text, so their tensors carry a leading dimension of size 1.
    """

    def __init__(self, kind = 'train', languages = ('en', 'de', 'fr'), n_samples = 1000, seed = 89, max_length = None):
        self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
        self.max_length = max_length

        per_language = []
        for language in languages:
            split = datasets.load_dataset('xnli', language=language)[kind]
            # Never ask for more rows than the split actually has.
            per_language.append(split.select(range(min(n_samples, len(split)))))

        dataset = datasets.concatenate_datasets(per_language).shuffle(seed=seed)
        self.premises_tokenized = [self._encode(text) for text in dataset['premise']]
        self.hypothesis_tokenized = [self._encode(text) for text in dataset['hypothesis']]
        self.labels = dataset['label']

    def _encode(self, text):
        """Tokenize one text, padded and truncated to a fixed length."""
        # truncation=True keeps over-long texts within the length limit;
        # without it padding='max_length' only pads and never shortens.
        return self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        """Return ``(premise_encoding, hypothesis_encoding, label)`` for ``idx``."""
        return self.premises_tokenized[idx], self.hypothesis_tokenized[idx], self.labels[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

# Build the training split first, then the validation split.
train_ds = XNLI_Dataset('train')
val_ds = XNLI_Dataset('validation')


Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=de/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /Users/pszachew/.cache/huggingface/datasets/xnli/default-language=en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd/cache-b49f5269a56ab5c5.arrow
Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=de/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xnli (/Users/pszachew/.cache/huggingface/datasets/xnli/default-language=fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /Users/pszachew/.cache/huggingface/datasets/xnli/default-language=en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd/cache-0dd3afacbef3b821.arrow


In [24]:
# Mini-batches of 8 examples; only the training loader shuffles.
train_dataloader = DataLoader(
    train_ds,
    batch_size=8,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_ds,
    batch_size=8,
)

In [16]:
# Peek at the first validation batch only; `batch` stays bound afterwards.
for batch in val_dataloader:
    hypothesis_encoding = batch[1]
    # input_ids is shown with the singleton axis removed, attention_mask as collated.
    print(hypothesis_encoding['input_ids'].squeeze(1).shape)
    print(hypothesis_encoding['attention_mask'].shape)
    break

torch.Size([2, 512])
torch.Size([2, 1, 512])


In [25]:
# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, hence the getattr guard.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# NOTE: some operators (e.g. 'aten::cumsum.out') may be unimplemented on MPS.
# Either set PYTORCH_ENABLE_MPS_FALLBACK=1 in the environment before starting
# the kernel, or force the CPU by uncommenting the next line.
#device = torch.device("cpu")
device

device(type='mps')

In [26]:
class XLM_RoBERTa_classifier(torch.nn.Module):
    """Sentence-pair classifier built on a shared pretrained encoder.

    Hypothesis and premise are encoded separately by the same encoder; their
    pooled representations are concatenated, passed through dropout and a
    linear layer, and normalized with a softmax over the class dimension.

    Args:
        model_checkpoint: Checkpoint name handed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability applied to the concatenated features.
        num_classes: Number of output classes (3 by default).
    """

    def __init__(self, model_checkpoint:str = 'xlm-roberta-base', dropout:float = 0.2, num_classes:int = 3):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_checkpoint)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768,
        # so checkpoints with a different hidden size also work.
        hidden_size = self.roberta.config.hidden_size
        self.linear = torch.nn.Linear(2 * hidden_size, num_classes)
        # Explicit dim: normalize across classes for (batch, num_classes) input.
        # Omitting it relies on a deprecated implicit choice and emits a warning.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids_hypo, attention_mask_hypo, input_ids_premise, attention_mask_premise):
        """Return class probabilities of shape ``(batch, num_classes)``.

        The inputs are assumed to be ``(batch, seq_len)`` tensors as produced
        by the tokenizer after removing the singleton axis.

        NOTE: the output is already softmax-normalized. Pair it with a loss
        that expects (log-)probabilities, not with one that expects logits.
        """
        encoded_hypothesis = self.roberta(input_ids_hypo, attention_mask=attention_mask_hypo)
        encoded_premise = self.roberta(input_ids_premise, attention_mask=attention_mask_premise)
        # Index 1 of the encoder output is used as the pooled sentence vector.
        pair_features = torch.cat((encoded_hypothesis[1], encoded_premise[1]), dim=1)
        pair_features = self.dropout(pair_features)
        class_scores = self.linear(pair_features)
        return self.softmax(class_scores)
    


In [17]:
# Smoke-test a forward pass on the batch left over from the shape check.
model = XLM_RoBERTa_classifier()
premise_batch, hypothesis_batch = batch[0], batch[1]
outputs = model(
    hypothesis_batch['input_ids'].squeeze(1),
    hypothesis_batch['attention_mask'].squeeze(1),
    premise_batch['input_ids'].squeeze(1),
    premise_batch['attention_mask'].squeeze(1),
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.softmax(linear_output)


In [27]:
model = XLM_RoBERTa_classifier()
optimizer = Adam(model.parameters(), lr=5e-5)
model.to(device)
num_epochs = 1
# The classifier's forward() ends in a softmax, i.e. it returns probabilities.
# nn.CrossEntropyLoss expects raw logits and applies log-softmax itself, so
# feeding it probabilities normalizes twice and flattens the gradients.
# NLLLoss on log-probabilities is the loss that matches the model's output.
loss_fun = nn.NLLLoss()

# Make sure dropout is active during optimization.
model.train()

for epoch in range(num_epochs):
    losses = []
    progress = tqdm(train_dataloader)
    for batch in progress:
        # Each batch is (premise encodings, hypothesis encodings, labels).
        premise_batch, hypothesis_batch, labels = batch[0], batch[1], batch[2]
        labels = labels.to(device)

        optimizer.zero_grad()

        # The encodings carry a singleton axis at dim 1; drop it before the forward pass.
        outputs = model(
            hypothesis_batch['input_ids'].squeeze(1).to(device),
            hypothesis_batch['attention_mask'].squeeze(1).to(device),
            premise_batch['input_ids'].squeeze(1).to(device),
            premise_batch['attention_mask'].squeeze(1).to(device),
        )
        # Clamp before the log so a probability that underflowed to 0 cannot yield -inf.
        loss = loss_fun(torch.log(outputs.clamp_min(1e-12)), labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        # Show the running mean on the progress bar instead of printing a line per step.
        progress.set_postfix(mean_loss=float(np.mean(losses)))
    print(np.mean(losses))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/375 [00:00<?, ?it/s]


NotImplementedError: The operator 'aten::cumsum.out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [36]:
# Compare the arg-max class of each row in `outputs` with the reference labels [1, 0].
predicted_classes = outputs.argmax(dim=1)
accuracy_score([1, 0], predicted_classes)

0.5

In [42]:
# Small demo: softmax along dim=1 turns each row into a distribution.
t = torch.tensor(
    [
        [1.0, 5.0, 10.0],
        [1.0, 2.0, 3.0],
    ]
)
softmax = torch.nn.Softmax(dim=1)
softmax(t)

tensor([[1.2257e-04, 6.6920e-03, 9.9319e-01],
        [9.0031e-02, 2.4473e-01, 6.6524e-01]])

In [34]:
# Display the tensor returned by the last model forward pass (one row per example).
outputs

tensor([[0.3175, 0.3294, 0.3531],
        [0.3912, 0.2814, 0.3275]], grad_fn=<SoftmaxBackward0>)

In [30]:
# Display `outputs` again; NOTE(review): this repeats the previous display cell.
outputs

tensor([[0.3175, 0.3294, 0.3531],
        [0.3912, 0.2814, 0.3275]], grad_fn=<SoftmaxBackward0>)