In [39]:
import torch

In [40]:
torch.cuda.is_available()

True

In [41]:
import csv
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForQuestionAnswering, BertTokenizer
import torch.optim.lr_scheduler as lr_scheduler

In [42]:
# Define the dataset
class SquadDataset(Dataset):
    def __init__(self, file_path):
        self.contexts = []
        self.questions = []
        self.answers = []
        self.answer_starts = []
        self.answer_ends = []

        with open(file_path, 'r', encoding='cp1252') as f:
            reader = csv.reader(f)
            next(reader) # skip header row
            for row in reader:
    
                self.contexts.append(row[0])
                self.questions.append(row[1])
                self.answers.append(row[2])
                self.answer_starts.append(int(row[3]))
                self.answer_ends.append(int(row[4]))

    
    def __len__(self):
        return len(self.contexts)

    
    def __getitem__(self, idx):
        return {
            'context': self.contexts[idx],
            'question': self.questions[idx],
            'answer': self.answers[idx],
            'answer_start': self.answer_starts[idx],
            'answer_end': self.answer_ends[idx]
        }

In [43]:
batch_size = 8
num_epochs = 3
learning_rate = 2e-5

In [44]:
train_dataset = SquadDataset('sample_squad_train_data.csv')
print(len(train_dataset))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(len(train_loader))

510
64


In [45]:
test_dataset = SquadDataset('sample_squad_test_data.csv')
print(len(test_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
print(len(test_loader))

200
25


In [55]:
from torch.cuda.amp import autocast, GradScaler

def train(model, data_loader, optimizer, device, ac_stps):
    model.train()
    model.to(device)
    total_loss = 0
    
    scaler = GradScaler()  # initialize the GradScaler object
    bc = 0

    for data in data_loader:
        # Move data to device
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )

        inputs = {key: val.to(device) for key, val in inputs.items()}
        inputs['input_ids']
        inputs['input_ids'].char_to_token(data['answer_start'])
        
        st_pos = data['answer_start'].to(device)
        end_pos = data['answer_end'].to(device)

        # Clear gradients
        optimizer.zero_grad()
        
        with autocast():  # enable automatic mixed precision
            outputs = model(**inputs, st_pos=st_pos, end_pos=end_pos)
            loss = outputs.loss
        
        scaler.scale(loss).backward()  # scale the loss and perform backward pass
        
        bc += 1
        if bc % ac_stps == 0:
            scaler.step(optimizer)  # update the model weights
            scaler.update()  # update the GradScaler for the next iteration
            optimizer.zero_grad()  # clear gradients
            
        lv = loss.item()
        
        if str(lv) == 'nan':
            lv = 0

        total_loss += lv

    if bc % ac_stps != 0:
        scaler.step(optimizer)  # update the model weights
        scaler.update()  # update the GradScaler for the next iteration
        optimizer.zero_grad()  # clear gradients
    
    return total_loss / len(data_loader)

In [56]:
def test(model, data_loader, optimizer, device):    
    valid_loss = 0.0
    model.eval()
    model.to(device)
    
    scaler = GradScaler()  # initialize the GradScaler object
    
    for data in data_loader:
        # Move data to device
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        st_pos = data['answer_start'].to(device)
        end_pos = data['answer_end'].to(device)

        # Clear gradients
        optimizer.zero_grad()
        
        with autocast():  # enable automatic mixed precision
            outputs = model(**inputs, st_pos=st_pos, end_pos=end_pos)
            loss = outputs.loss
        
        lv = loss.item()
        
        if str(lv) == 'nan':
            lv = 0
        
        valid_loss += lv

    return valid_loss / len(data_loader)


In [57]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length=512)
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Initialize the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Define learning rate scheduler
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

model = model.to(device)
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device, ac_stps=2)
    test_loss = test(model, test_loader, optimizer, device)
    print(f'Epoch {epoch+1} , train loss {train_loss}, test loss {test_loss}')
    # Update learning rate
    scheduler.step(test_loss)

cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

Epoch 1 , train loss 5.662377052009106, test loss 5.720812129974365
Epoch 2 , train loss 5.437935434281826, test loss 5.669600734710693
Epoch 3 , train loss 5.194169580936432, test loss 5.722679519653321


In [58]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased', max_length=512)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')

# Initialize the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Define learning rate scheduler
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

model = model.to(device)
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device, ac_stps=2)
    test_loss = test(model, test_loader, optimizer, device)
    print(f'Epoch {epoch+1} , train loss {train_loss}, test loss {test_loss}')

    scheduler.step(test_loss)


Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on

Epoch 1 , train loss 5.717407636344433, test loss 5.78319902420044
Epoch 2 , train loss 5.684388391673565, test loss 5.789040679931641
Epoch 3 , train loss 5.421862803399563, test loss 5.853338603973389


In [1]:
def predict_answer(model, tokenizer, context, question):
    # Tokenize inputs
    inputs = tokenizer(question, context, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    
    # Forward pass
    output = model(**inputs)

    # Get predicted answer
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits) + 1
    
    if end_idx < start_idx:
        # Swap the indices if end_idx is less than start_idx
        start_idx, end_idx = end_idx, start_idx
    
    answer = tokenizer.decode(input_ids[start_idx:end_idx])
    return answer

In [20]:
context = "The quick brown fox jumps over the lazy dog."
question = "What does the fox jump over?"
answer = predict_answer(model, tokenizer, context, question)
print(answer)  # Output: "the lazy dog"

tensor(0, device='cuda:0') tensor(10, device='cuda:0')
tensor([  101,  2054,  2515,  1996,  4419,  5376,  2058,  1029,   102,  1996,
         4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,  1012,   102],
       device='cuda:0')
[CLS] what does the fox jump over? [SEP] the
