In [15]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
!pip install transformers
import transformers
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from tqdm import tqdm



In [16]:
df = pd.read_csv('context_data.csv')

In [17]:
questions = df['question'].tolist()
answers = df['answer'].tolist()

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
input_ids = []
attention_masks = []
start_positions = []
end_positions = []

for question, answer in zip(questions, answers):
    encoded_dict = tokenizer.encode_plus(
        question,
        answer,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_dict['input_ids'].squeeze())
    attention_masks.append(encoded_dict['attention_mask'].squeeze())
    start_positions.append(encoded_dict['input_ids'].squeeze().tolist().index(102))
    end_positions.append(encoded_dict['input_ids'].squeeze().tolist().index(102) + len(encoded_dict['input_ids'].squeeze().tolist()) - 1)


In [20]:
input_ids = torch.stack(input_ids)
attention_masks = torch.stack(attention_masks)
start_positions = torch.tensor(start_positions)
end_positions = torch.tensor(end_positions)

class QADataset(Dataset):
    def __init__(self, input_ids, attention_masks, start_positions, end_positions):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.start_positions = start_positions
        self.end_positions = end_positions
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.start_positions[idx], self.end_positions[idx]


In [21]:
dataset = QADataset(input_ids, attention_masks, start_positions, end_positions)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

batch_size = 8

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [22]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [23]:
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids, attention_masks, start_positions, end_positions = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        model.zero_grad()

        outputs = model(input_ids, attention_mask=attention_masks, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss

        if loss is not None:  # Check if loss is valid
            train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Training loss: {avg_train_loss}')

    model.eval()
    val_loss = 0.0

    for batch in tqdm(val_dataloader, desc=f'Validation'):
        input_ids, attention_masks, start_positions, end_positions = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss

            if loss is not None:  # Check if loss is valid
                val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Validation loss: {avg_val_loss}')

Epoch 1/3: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [17:21<00:00, 104.15s/it]


Training loss: 2.2326038002967836


Validation: 100%|████████████████████████████████████████████████████████████████████████| 3/3 [01:53<00:00, 37.72s/it]


Validation loss: 0.9500338633855184


Epoch 2/3: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [18:19<00:00, 109.99s/it]


Training loss: 0.6989322304725647


Validation: 100%|████████████████████████████████████████████████████████████████████████| 3/3 [01:52<00:00, 37.54s/it]


Validation loss: 0.3818645675977071


Epoch 3/3: 100%|███████████████████████████████████████████████████████████████████████| 10/10 [12:36<00:00, 75.62s/it]


Training loss: 0.3811930537223816


Validation: 100%|████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.52s/it]

Validation loss: 0.3491419156392415





In [24]:
model.save_pretrained('bert_qa_model')
tokenizer.save_pretrained('bert_qa_model')

('bert_qa_model\\tokenizer_config.json',
 'bert_qa_model\\special_tokens_map.json',
 'bert_qa_model\\vocab.txt',
 'bert_qa_model\\added_tokens.json')

In [48]:
def answer_question(question,answer):
    encoded_dict = tokenizer.encode_plus(
        question,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # Convert the indices to Python integers
    start_index = start_index.item()
    end_index = end_index.item()

    # Retrieve the original input_ids and attention_mask tensors from the dataset
    original_input_ids = input_ids.squeeze().tolist()
    original_attention_mask = attention_mask.squeeze().tolist()

    # Get the answer span without the [PAD] tokens
    answer = tokenizer.decode(original_input_ids[start_index:end_index + 1], skip_special_tokens=True)

    return answer

In [49]:
question = "What is the review for Nutrigrain Cereal Bars ?"
answer = answer_question(question,answer)
print("Answer:", answer)

Answer: 
