In [None]:
# Mount Google Drive at /content/drive/. The next cell switches the working
# directory to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Make the Task-1 folder the current working directory, so any relative path
# used afterwards resolves against it. The path sits under the /content/drive/
# mount point created by drive.mount() in the first cell.
import os
os.chdir('/content/drive/MyDrive/Task-1')

In [None]:
# Import necessary libraries
from datasets import load_dataset
import torch
from tqdm.auto import tqdm
from transformers import BertTokenizerFast, BertForQuestionAnswering

In [None]:
# Load the 'squad' dataset via `datasets.load_dataset`. The returned object is
# indexed by split name; only its 'train' split is sampled further down.
dataset = load_dataset('squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Add end indices to answers
def add_end_idx(answers, contexts):
    """Annotate each answer with the character offset where it ends.

    For every (answer, context) pair the first gold answer is used:
    ``answer['text'][0]`` located at ``answer['answer_start'][0]``. The
    exclusive end offset is stored under ``answer['answer_end']``.

    If the text found at the recorded offsets does not equal the gold text,
    the span is retried one and then two characters to the left. When a
    shifted span matches, BOTH the start and the end are corrected, so the two
    offsets always describe the same span. If no shift matches, the recorded
    start is kept and the end is ``start + len(text)``.

    Args:
        answers: iterable of dicts with list-valued ``'text'`` and
            ``'answer_start'`` entries. The dicts are updated in place.
        contexts: iterable of context strings, aligned with ``answers``.

    Returns:
        A list holding the same (now annotated) answer dicts.
    """
    # Materialise once so a lazy / one-shot iterable is still usable by the
    # caller after this function has walked it.
    answers = list(answers)

    for answer, context in zip(answers, contexts):
        gold_text = answer['text'][0]
        start_idx = answer['answer_start'][0]
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] != gold_text:
            # Try correcting misalignments of one or two characters.
            for n in (1, 2):
                shifted_start = start_idx - n
                # A negative start would make the slice wrap around to the end
                # of the context, so only non-negative shifts are considered.
                if shifted_start >= 0 and context[shifted_start:end_idx - n] == gold_text:
                    start_idx = shifted_start
                    end_idx -= n
                    break

        # Write the (possibly corrected) start back as well; downstream code
        # reads answer['answer_start'][0], not the local variable.
        answer['answer_start'][0] = start_idx
        answer['answer_end'] = end_idx
    return answers

In [None]:
# Prepare the data
def prep_data(dataset):
    """Collect contexts, questions and end-annotated answers from a split.

    Reads the 'context', 'question' and 'answers' columns of ``dataset`` and
    passes the answers through ``add_end_idx`` together with the contexts.

    Returns:
        dict with keys 'context', 'question' and 'answers'.
    """
    context_column = dataset['context']
    return dict(
        context=context_column,
        question=dataset['question'],
        answers=add_end_idx(dataset['answers'], context_column),
    )

In [None]:
# Shuffle the training split with a fixed seed, keep the first 1000 examples
# of the shuffled order, and run them through prep_data.
train_data = prep_data(
    dataset['train']
    .shuffle(seed=123)
    .select(range(1000))
)

In [None]:
# Tokenize dataset
# Fast BERT tokenizer loaded from the 'bert-base-uncased' checkpoint; the
# tokenize_data function defined below reads it as a module-level global.
# NOTE(review): the QA model loaded later in this notebook comes from a
# different checkpoint ('bert-large-uncased-whole-word-masking-finetuned-squad')
# - confirm that both checkpoints share the same vocabulary.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_data(data):
    """Tokenize (context, question) pairs and attach answer token positions.

    Args:
        data: mapping with 'context' and 'question' (lists of strings) and
            'answers' (list of dicts carrying a list-valued 'answer_start'
            and an integer, exclusive 'answer_end' character offset into the
            context).

    Returns:
        The tokenizer's encodings (padded/truncated to 512 tokens, PyTorch
        tensors) extended with two lists, 'start_positions' and
        'end_positions', giving the token index of the first and of the last
        answer token for every example. When a character offset cannot be
        mapped to a token (for instance because the answer was truncated
        away), ``tokenizer.model_max_length`` is stored instead.
    """
    encodings = tokenizer(
        data['context'], data['question'],
        truncation=True, padding='max_length',
        max_length=512, return_tensors='pt'
    )

    # Sentinel for "answer not present in the encoded window".
    # BertForQuestionAnswering clamps label positions to the sequence length
    # and ignores that index in the loss.
    missing = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(data['answers']):
        # The context is the first sequence of the pair, which is what
        # char_to_token resolves offsets against by default.
        start = encodings.char_to_token(i, answer['answer_start'][0])

        # 'answer_end' is exclusive: it points at the character AFTER the
        # answer (usually whitespace, which maps to no token, or the start of
        # the next word). The last answer token is the one that contains the
        # character just before it.
        last_char = max(answer['answer_end'] - 1, 0)
        end = encodings.char_to_token(i, last_char)

        start_positions.append(missing if start is None else start)
        end_positions.append(missing if end is None else end)

    # Attach the labels once, after all examples have been processed.
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

# Tokenize the prepared training sample once; the result is wrapped in a
# SquadDataset in the cells that follow.
train_encodings = tokenize_data(train_data)

In [None]:
# Define custom PyTorch dataset
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-example encodings.

    ``encodings`` maps field names (it must include 'input_ids') to
    sequences or tensors indexed by example along their first axis, e.g. the
    object returned by ``tokenize_data``. Both a tokenizer ``BatchEncoding``
    and a plain ``dict`` are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of freshly allocated tensors."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning;
                # clone().detach() is the recommended way to copy a tensor.
                item[key] = row.clone().detach()
            else:
                # Plain Python values (e.g. the integer label positions).
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        # Key lookup works for BatchEncoding and for ordinary dicts alike.
        return len(self.encodings['input_ids'])

In [None]:
# Wrap the tokenized examples in a Dataset and serve them in shuffled batches of two
train_dataset = SquadDataset(train_encodings)
loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=2,
)

In [None]:
# Prepare the dataset and dataloader
# (rebuilds the same train_dataset / loader pair as the preceding cell)
train_dataset = SquadDataset(encodings=train_encodings)
loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
)

In [None]:
# Load the pre-trained BERT model
# Going by its name, this checkpoint has presumably already been fine-tuned on
# SQuAD (TODO confirm); the training cells below continue from these weights.
# The load warning printed under this cell lists the checkpoint's
# 'bert.pooler.*' weights as not used when initializing BertForQuestionAnswering.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Pick CUDA when it is available, otherwise fall back to the CPU, and place the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [None]:
# Quick fine-tuning pass: a single epoch over per-example batches
epochs = 1      # one pass only, for quick execution
batch_size = 1  # smallest batch size, to reduce memory usage
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Rebuild the DataLoader so it uses the smaller batch size
loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Training loop: one optimizer step per batch, mean loss reported per epoch
for epoch in range(epochs):
    model.train()
    batch_losses = []
    for batch in loader:
        optimizer.zero_grad()
        # Move every tensor of the batch to the model's device and run the forward pass
        step_output = model(**{name: tensor.to(device) for name, tensor in batch.items()})
        step_output.loss.backward()
        optimizer.step()
        batch_losses.append(step_output.loss.item())

    total_loss = sum(batch_losses)
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(loader):.4f}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
# Define optimizer and training parameters
# A fresh AdamW optimizer is created here, so no optimizer state carries over
# from any earlier training cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 2

# Training loop with a tqdm progress bar showing the running mean loss
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(loader, desc=f"Epoch {epoch + 1}")
    # Count batches from 1 so `step` is the number of batches processed so far.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()
        # Move every tensor of the batch to the model's device.
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Running mean over the batches seen so far. Dividing by len(loader)
        # would under-report the loss until the very last batch of the epoch.
        progress_bar.set_postfix(loss=total_loss / step)

NameError: name 'torch' is not defined