# Install Library

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 16.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 54.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 54.5 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 60.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 46.3 MB/s 
Installing coll

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 12.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.5 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.22.1


# Explore Data

<pre>
'version':,   
'data:  
  [  
    {
        'id': some str as id, 
        'type': 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'
        'question': tokenized question into list of words,
        'context': tokenized context into list of words,
        'num_span': int, number of answer spans,
        'label': list of BIO tag as label,
        'structure': 'Complex', 'Conjunction', 'Non-Redundant', 'Redundant', 'Share'
    },
  ]
</pre>

# Github repo

In [3]:
!git clone https://github.com/haonan-li/MultiSpanQA.git

Cloning into 'MultiSpanQA'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 32 (delta 14), reused 30 (delta 12), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


In [4]:
%cd /content/MultiSpanQA

/content/MultiSpanQA


In [5]:
!pwd

/content/MultiSpanQA


# DataLoader

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

In [86]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
from datasets import load_dataset, load_metric


data_files = {'train': '/content/MultiSpanQA/data/MultiSpanQA_data/train.json',
              'validation': '/content/MultiSpanQA/data/MultiSpanQA_data/valid.json'}
raw_datasets = load_dataset('json', field='data', data_files=data_files)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a9d7d6d58c5c4dad/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a9d7d6d58c5c4dad/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
train_examples = raw_datasets["train"]

In [9]:
type(train_examples)

datasets.arrow_dataset.Dataset

In [10]:
label_list = ["B", "I", "O"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}

structure_list = ['Complex', 'Conjunction', 'Non-Redundant', 'Redundant', 'Share', '']
structure_to_id = {l: i for i, l in enumerate(structure_list)}

In [11]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
        is_split_into_words=True,
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["labels"] = []
    tokenized_examples["num_span"] = []
    tokenized_examples["structure"] = []
    tokenized_examples["example_id"] = []
    # tokenized_examples["word_ids"] = []
    # tokenized_examples["sequence_ids"] = []

    for i, sample_index in enumerate(sample_mapping):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        label = examples['label'][sample_index]
        word_ids = tokenized_examples.word_ids(i)
        previous_word_idx = None
        label_ids = [-100] * token_start_index

        for word_idx in word_ids[token_start_index:]:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        tokenized_examples["labels"].append(label_ids)
        tokenized_examples["num_span"].append(examples['num_span'][sample_index] / 30)
        tokenized_examples["structure"].append(structure_to_id[examples['structure'][sample_index] if 'structure' in examples else ''])
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # tokenized_examples["word_ids"].append(word_ids)
        # tokenized_examples["sequence_ids"].append(sequence_ids)
    return tokenized_examples

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
            'bert-base-uncased',
            use_fast=True,
            use_auth_token=False,
            add_prefix_space=True,
         )

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
column_names = raw_datasets["train"].column_names
features = raw_datasets["train"].features

In [14]:
column_names

['id', 'type', 'question', 'context', 'num_span', 'label', 'structure']

In [15]:
train_dataset = train_examples.map(
                prepare_train_features,
                batched=True,
                remove_columns=column_names,
                desc="Running tokenizer on train dataset",
            )

Running tokenizer on train dataset:   0%|          | 0/6 [00:00<?, ?ba/s]

In [16]:
type(train_dataset)

datasets.arrow_dataset.Dataset

In [17]:
train_dataset.__getitem__(0).keys()

dict_keys(['num_span', 'structure', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'example_id'])

In [18]:
len(train_dataset)

5616

In [19]:
def convert_to_tensor(batch):
    new_batch = {}
    new_batch['example_id'] = [example['example_id'] for example in batch]
    new_batch['num_span'] = torch.Tensor([example['num_span'] for example in batch])
    new_batch['structure'] = torch.LongTensor([example['structure'] for example in batch])
    new_batch['input_ids'] = torch.stack([torch.LongTensor(example['input_ids']) for example in batch])
    new_batch['token_type_ids'] = torch.stack([torch.LongTensor(example['token_type_ids']) for example in batch])
    new_batch['attention_mask'] = torch.stack([torch.LongTensor(example['attention_mask']) for example in batch])
    new_batch['labels'] = torch.stack([torch.LongTensor(example['labels']) for example in batch])

    return new_batch

In [55]:
dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=convert_to_tensor)

RoBERTa doesn't use token_type_ids

In [56]:
item = next(iter(dataloader))

In [57]:
item.keys()

dict_keys(['example_id', 'num_span', 'structure', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [58]:
item['token_type_ids'].shape

torch.Size([2, 512])

# Models

In [31]:
import torch.nn as nn
from transformers import AutoModelForTokenClassification, AutoModel

class MultiSpanQATagger(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)
        # self.model = AutoModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)
        return outputs

In [32]:
model = MultiSpanQATagger()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

## Train

In [59]:
out = model(item['input_ids'], item['attention_mask'], item['token_type_ids'])

In [60]:
out['logits'].shape

torch.Size([2, 512, 3])

In [61]:
item['labels'].shape

torch.Size([2, 512])

In [83]:
loss = nn.CrossEntropyLoss(reduction='mean')

In [63]:
labels = item['labels'].reshape(-1)

In [64]:
labels.shape

torch.Size([1024])

In [65]:
outputs = out['logits'].reshape(out['logits'].shape[0] * out['logits'].shape[1], -1)

In [84]:
l = loss(outputs, labels)

In [85]:
l

tensor(0.9499, grad_fn=<NllLossBackward0>)

In [77]:
l = l.reshape(out['logits'].shape[0], out['logits'].shape[1])

In [82]:
l.sum(axis=1)

tensor([331.2832, 155.0878], grad_fn=<SumBackward1>)

In [88]:
from torch.optim import AdamW

torch.manual_seed(0)
epochs = 4
print_every = 1
optim = AdamW(model.parameters(), lr=1e-5)
loss_func = nn.CrossEntropyLoss(reduction='mean')

for epoch in range(epochs):
    # Set model in train mode
    model.train()
    loss_of_epoch = 0

    print("############Train############")
    for batch_idx, batch in enumerate(dataloader):
        optim.zero_grad()
        out = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
        labels = item['labels'].reshape(-1)
        outputs = out['logits'].reshape(out['logits'].shape[0] * out['logits'].shape[1], -1)
        loss = loss_func(outputs, labels)
            
        loss.backward()
        optim.step()
        loss_of_epoch += loss.item()
        if (batch_idx + 1) % print_every == 0:
            print("Batch {:} / {:}".format(batch_idx + 1, len(dataloader)))
            print("Loss:", round(loss.item(), 2))
            # torch.save(model, "/kaggle/working/yolo_qa.pth")
        break
    loss_of_epoch /= len(dataloader)
    print("\n-------Epoch ", epoch + 1,
        "-------"
        "\nTraining Loss:", loss_of_epoch,
        "\n-----------------------",
        "\n\n")
    break

############Train############
Batch 1 / 2808
Loss: 0.81

-------Epoch  1 -------
Training Loss: 0.0002872545452539058 
----------------------- 


