In [11]:
!pip install transformers
!pip install datasets
!pip install accelerate

[0mCollecting accelerate
  Downloading accelerate-0.10.0-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.1/117.1 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.10.0
[0m

In [1]:
import os
import torch.nn as nn
from transformers import BertModel, BertConfig
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from datasets import Dataset, load_dataset
raw_datasets = load_dataset("squad")

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
train = Dataset.from_dict(raw_datasets["train"][0:5829])
val   = Dataset.from_dict(raw_datasets["train"][43463:44786])
test  = Dataset.from_dict(raw_datasets["train"][44786:45428])

In [18]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [5]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [6]:
train_dataset = train.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train.column_names,
)
len(train), len(train_dataset)



  0%|          | 0/6 [00:00<?, ?ba/s]

(5829, 5923)

In [7]:
validation_dataset = val.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=val.column_names,
)
len(val), len(validation_dataset)

  0%|          | 0/2 [00:00<?, ?ba/s]

(1323, 1323)

In [8]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)

In [20]:
from torch.optim import AdamW
from transformers import get_scheduler
from accelerate import Accelerator

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator(fp16=True)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-squad-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'shashank1303/bert-finetuned-squad-accelerate'

In [13]:
!sudo apt-get install software-properties-common
!sudo curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
software-properties-common is already the newest version (0.99.9.8).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
Detected operating system as Ubuntu/focal.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 7168 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 https://packagecloud.io/github/git-lfs/ubuntu focal/main amd64 git-l

In [36]:
output_dir = "bert-finetuned-squad-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

/notebooks/bert-finetuned-squad-accelerate is already a clone of https://huggingface.co/shashank1303/bert-finetuned-squad-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


In [13]:
from tqdm.auto import tqdm
from datasets import load_metric
import numpy as np
import collections

def compute_metrics(start_logits, end_logits, features, examples):
    n_best = 20
    max_answer_length = 30
    predicted_answers = []
    metric = load_metric("squad")
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [25]:
from transformers import PreTrainedModel

In [32]:
class BertForQuestionAnswering(PreTrainedModel):

    def __init__(self,config):
        super().__init__(config)
        self.bert = BertModel.from_pretrained ('bert-base-uncased')
        self.qa_outputs = nn.Linear(768, 2)

    def forward(
        self,
        input_ids = None,
        attention_mask = None,
        token_type_ids = None,
        start_positions = None,
        end_positions = None
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

In [41]:
model = BertForQuestionAnswering(BertConfig())

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    train_avg_loss = 0
    for step, batch in enumerate(train_dataloader):
        #print(batch)
        outputs = model(**batch)
        loss = outputs[0]
        train_avg_loss += loss.item()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        if step % 100 == 0:
            print(f"epoch: {epoch} iteration: {step} training loss: {train_avg_loss/(step+1)}")


    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs[0]).cpu().numpy())
        end_logits.append(accelerator.gather(outputs[1]).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, val
    )
    print(f"epoch {epoch}: , train loss {train_avg_loss/len(train_dataloader)}", metrics)

    # Save and upload
    """
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
    """

  0%|          | 0/741 [00:00<?, ?it/s]

epoch: 0 iteration: 0 training loss: 6.010009765625
epoch: 0 iteration: 100 training loss: 4.470323014967512
epoch: 0 iteration: 200 training loss: 3.5704570789242265
epoch: 0 iteration: 300 training loss: 3.0654898988844153
epoch: 0 iteration: 400 training loss: 2.7716740634375974
epoch: 0 iteration: 500 training loss: 2.5575946286290945
epoch: 0 iteration: 600 training loss: 2.399202914880635
epoch: 0 iteration: 700 training loss: 2.2754977426243235
Evaluation!


  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/1323 [00:00<?, ?it/s]

epoch 0: , train loss 2.2413433511891023 {'exact_match': 51.70068027210884, 'f1': 66.73202779349897}


In [22]:
test_dataset = test.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=test.column_names,
)
test_set = test_dataset.remove_columns(["example_id", "offset_mapping"])
test_set.set_format("torch")
test_dataloader = DataLoader(
    test_set, collate_fn=default_data_collator, batch_size=8
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_dataloader = accelerator.prepare(
    test_dataloader
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
model.eval()
start_logits = []
end_logits = []
print("Test!")
for batch in tqdm(test_dataloader):
    with torch.no_grad():
        outputs = model(**batch)
     
    start_logits.append(outputs[0].cpu().numpy())
    end_logits.append(outputs[0].cpu().numpy())

start_logits = np.concatenate(start_logits)
end_logits = np.concatenate(end_logits)
start_logits = start_logits[: len(validation_dataset)]
end_logits = end_logits[: len(validation_dataset)]

metrics = compute_metrics(
    start_logits, end_logits, test_dataset, test
)
print(metrics)

Test!


  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/642 [00:00<?, ?it/s]

{'exact_match': 48.13084112149533, 'f1': 64.001094077288}


In [89]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "shashank1303/bert-finetuned-squad-accelerate"
question_answerer = pipeline("question-answering", model=model_checkpoint)

context = """
The course is thaught by sir Chandra Sekhar and one of the assignment is to finetune BERT for question answering. 
My team members are Abhishek, Shashank and Dheeraj.We need to give viva sometime next week and is taken by sir.
"""
questions = ["Names of my team members?","Who is teaching the course?","Who is taking is viva?","What is the task given to students?",
            "What is the assignment?"]
for question in questions:
    print(question_answerer(question=question, context=context))

{'score': 0.9303467273712158, 'start': 136, 'end': 166, 'answer': 'Abhishek, Shashank and Dheeraj'}
{'score': 0.9721232652664185, 'start': 26, 'end': 44, 'answer': 'sir Chandra Sekhar'}
{'score': 0.1526615172624588, 'start': 223, 'end': 226, 'answer': 'sir'}
{'score': 0.12367977201938629, 'start': 95, 'end': 113, 'answer': 'question answering'}
{'score': 0.5629004240036011, 'start': 77, 'end': 113, 'answer': 'finetune BERT for question answering'}
